opencv/modules/ocl/src/kernels/split_mat.cl
Niko 5df77a841e remove redundant OPENCL_DIR flag
remove as much warnings as possible
use enum instead of MACRO for ocl.hpp
add command line parser in accuracy test and perf test
some bug fix for arthim functions
2012-10-22 15:14:22 +08:00

1223 lines
64 KiB
Common Lisp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////optimized code using vector ////////////////////////////////
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
if((dst0_offset & 3) == 3)
tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x);
if((dst0_offset & 3) == 2)
tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x);
if((dst0_offset & 3) == 1)
tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x);
if((dst0_offset & 3) == 0)
tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x);
if((dst1_offset & 3) == 3)
tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y);
if((dst1_offset & 3) == 2)
tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y);
if((dst1_offset & 3) == 1)
tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y);
if((dst1_offset & 3) == 0)
tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y);
if((dst2_offset & 3) == 3)
tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z);
if((dst2_offset & 3) == 2)
tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z);
if((dst2_offset & 3) == 1)
tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z);
if((dst2_offset & 3) == 0)
tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z);
if((dst3_offset & 3) == 3)
tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w);
if((dst3_offset & 3) == 2)
tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w);
if((dst3_offset & 3) == 1)
tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w);
if((dst3_offset & 3) == 0)
tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w);
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx));
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
*((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
*((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
}
}
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
uchar4 tmp_data0, tmp_data1, tmp_data2;
uchar src_data_0 = *(mat_src + src_idx + 3 * x - 9);
uchar src_data_1 = *(mat_src + src_idx + 3 * x - 8);
uchar src_data_2 = *(mat_src + src_idx + 3 * x - 7);
uchar src_data_3 = *(mat_src + src_idx + 3 * x - 6);
uchar src_data_4 = *(mat_src + src_idx + 3 * x - 5);
uchar src_data_5 = *(mat_src + src_idx + 3 * x - 4);
uchar src_data_6 = *(mat_src + src_idx + 3 * x - 3);
uchar src_data_7 = *(mat_src + src_idx + 3 * x - 2);
uchar src_data_8 = *(mat_src + src_idx + 3 * x - 1);
uchar src_data_9 = *(mat_src + src_idx + 3 * x + 0);
uchar src_data_10 = *(mat_src + src_idx + 3 * x + 1);
uchar src_data_11 = *(mat_src + src_idx + 3 * x + 2);
uchar src_data_12 = *(mat_src + src_idx + 3 * x + 3);
uchar src_data_13 = *(mat_src + src_idx + 3 * x + 4);
uchar src_data_14 = *(mat_src + src_idx + 3 * x + 5);
uchar src_data_15 = *(mat_src + src_idx + 3 * x + 6);
uchar src_data_16 = *(mat_src + src_idx + 3 * x + 7);
uchar src_data_17 = *(mat_src + src_idx + 3 * x + 8);
uchar src_data_18 = *(mat_src + src_idx + 3 * x + 9);
uchar src_data_19 = *(mat_src + src_idx + 3 * x + 10);
uchar src_data_20 = *(mat_src + src_idx + 3 * x + 11);
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2;
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
tmp_data1 = (dst1_offset & 3) == 0 ? (uchar4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
data0 = (uchar4)(src_data_2, src_data_5, src_data_8, src_data_11);
data1 = (dst2_offset & 3) == 2 ? (uchar4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
data2 = (dst2_offset & 3) == 1 ? (uchar4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
tmp_data2 = (dst2_offset & 3) == 0 ? (uchar4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
*((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
}
}
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
src_data_0.s01234567 = src_data_0.s67012345;
if(src_idx_0 == -4)
src_data_0.s01234567 = src_data_0.s45670123;
if(src_idx_0 == -2)
src_data_0.s01234567 = src_data_0.s23456701;
if(src_idx_1 == -6)
src_data_1.s01234567 = src_data_1.s67012345;
if(src_idx_1 == -4)
src_data_1.s01234567 = src_data_1.s45670123;
if(src_idx_1 == -2)
src_data_1.s01234567 = src_data_1.s23456701;
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 tmp_data0, tmp_data1;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
}
}
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
if((dst0_offset & 3) == 3)
tmp_data0 = (char4)(data_0.x, data_1.x, data_2.x, data_3.x);
if((dst0_offset & 3) == 2)
tmp_data0 = (char4)(data_1.x, data_2.x, data_3.x, data_4.x);
if((dst0_offset & 3) == 1)
tmp_data0 = (char4)(data_2.x, data_3.x, data_4.x, data_5.x);
if((dst0_offset & 3) == 0)
tmp_data0 = (char4)(data_3.x, data_4.x, data_5.x, data_6.x);
if((dst1_offset & 3) == 3)
tmp_data1 = (char4)(data_0.y, data_1.y, data_2.y, data_3.y);
if((dst1_offset & 3) == 2)
tmp_data1 = (char4)(data_1.y, data_2.y, data_3.y, data_4.y);
if((dst1_offset & 3) == 1)
tmp_data1 = (char4)(data_2.y, data_3.y, data_4.y, data_5.y);
if((dst1_offset & 3) == 0)
tmp_data1 = (char4)(data_3.y, data_4.y, data_5.y, data_6.y);
if((dst2_offset & 3) == 3)
tmp_data2 = (char4)(data_0.z, data_1.z, data_2.z, data_3.z);
if((dst2_offset & 3) == 2)
tmp_data2 = (char4)(data_1.z, data_2.z, data_3.z, data_4.z);
if((dst2_offset & 3) == 1)
tmp_data2 = (char4)(data_2.z, data_3.z, data_4.z, data_5.z);
if((dst2_offset & 3) == 0)
tmp_data2 = (char4)(data_3.z, data_4.z, data_5.z, data_6.z);
if((dst3_offset & 3) == 3)
tmp_data3 = (char4)(data_0.w, data_1.w, data_2.w, data_3.w);
if((dst3_offset & 3) == 2)
tmp_data3 = (char4)(data_1.w, data_2.w, data_3.w, data_4.w);
if((dst3_offset & 3) == 1)
tmp_data3 = (char4)(data_2.w, data_3.w, data_4.w, data_5.w);
if((dst3_offset & 3) == 0)
tmp_data3 = (char4)(data_3.w, data_4.w, data_5.w, data_6.w);
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
char4 dst3_data = *((__global char4 *)(mat_dst3 + dst3_idx));
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
*((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
*((__global char4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
}
}
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
char4 tmp_data0, tmp_data1, tmp_data2;
char src_data_0 = *(mat_src + src_idx + 3 * x - 9);
char src_data_1 = *(mat_src + src_idx + 3 * x - 8);
char src_data_2 = *(mat_src + src_idx + 3 * x - 7);
char src_data_3 = *(mat_src + src_idx + 3 * x - 6);
char src_data_4 = *(mat_src + src_idx + 3 * x - 5);
char src_data_5 = *(mat_src + src_idx + 3 * x - 4);
char src_data_6 = *(mat_src + src_idx + 3 * x - 3);
char src_data_7 = *(mat_src + src_idx + 3 * x - 2);
char src_data_8 = *(mat_src + src_idx + 3 * x - 1);
char src_data_9 = *(mat_src + src_idx + 3 * x + 0);
char src_data_10 = *(mat_src + src_idx + 3 * x + 1);
char src_data_11 = *(mat_src + src_idx + 3 * x + 2);
char src_data_12 = *(mat_src + src_idx + 3 * x + 3);
char src_data_13 = *(mat_src + src_idx + 3 * x + 4);
char src_data_14 = *(mat_src + src_idx + 3 * x + 5);
char src_data_15 = *(mat_src + src_idx + 3 * x + 6);
char src_data_16 = *(mat_src + src_idx + 3 * x + 7);
char src_data_17 = *(mat_src + src_idx + 3 * x + 8);
char src_data_18 = *(mat_src + src_idx + 3 * x + 9);
char src_data_19 = *(mat_src + src_idx + 3 * x + 10);
char src_data_20 = *(mat_src + src_idx + 3 * x + 11);
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2;
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
tmp_data1 = (dst1_offset & 3) == 0 ? (char4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
data0 = (char4)(src_data_2, src_data_5, src_data_8, src_data_11);
data1 = (dst2_offset & 3) == 2 ? (char4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
data2 = (dst2_offset & 3) == 1 ? (char4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
tmp_data2 = (dst2_offset & 3) == 0 ? (char4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
*((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
}
}
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
src_data_0.s01234567 = src_data_0.s67012345;
if(src_idx_0 == -4)
src_data_0.s01234567 = src_data_0.s45670123;
if(src_idx_0 == -2)
src_data_0.s01234567 = src_data_0.s23456701;
if(src_idx_1 == -6)
src_data_1.s01234567 = src_data_1.s67012345;
if(src_idx_1 == -4)
src_data_1.s01234567 = src_data_1.s45670123;
if(src_idx_1 == -2)
src_data_1.s01234567 = src_data_1.s23456701;
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 tmp_data0, tmp_data1;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
}
}
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
if(src_idx_0 == -4)
src_data0.s01234567 = src_data0.s45670123;
if(src_idx_0 == -2)
src_data0.s01234567 = src_data0.s23456701;
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
ushort2 dst3_data = *((__global ushort2 *)((__global char *)mat_dst3 + dst3_idx));
ushort2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data0.s4, src_data1.s0) : (ushort2)(src_data0.s0, src_data0.s4);
tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data0.s5, src_data1.s1) : (ushort2)(src_data0.s1, src_data0.s5);
tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data0.s6, src_data1.s2) : (ushort2)(src_data0.s2, src_data0.s6);
tmp_data3 = (dst3_offset & 3) == 0 ? (ushort2)(src_data0.s7, src_data1.s3) : (ushort2)(src_data0.s3, src_data0.s7);
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
*((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
*((global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
*((global ushort2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
}
}
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
ushort2 tmp_data0, tmp_data1, tmp_data2;
ushort src_data_0 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 3];
ushort src_data_1 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 2];
ushort src_data_2 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 1];
ushort src_data_3 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 0];
ushort src_data_4 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 1];
ushort src_data_5 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 2];
ushort src_data_6 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 3];
ushort src_data_7 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 4];
ushort src_data_8 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 5];
tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data_3, src_data_6) : (ushort2)(src_data_0, src_data_3);
tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data_4, src_data_7) : (ushort2)(src_data_1, src_data_4);
tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data_5, src_data_8) : (ushort2)(src_data_2, src_data_5);
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
*((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
*((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
}
}
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
if(src_idx_0 < 0)
{
ushort4 tmp;
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_idx_1 < 0)
{
ushort4 tmp;
tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
}
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 tmp_data0, tmp_data1;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
*((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
}
}
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
if(src_idx_0 == -4)
src_data0.s01234567 = src_data0.s45670123;
if(src_idx_0 == -2)
src_data0.s01234567 = src_data0.s23456701;
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
short2 dst3_data = *((__global short2 *)((__global char *)mat_dst3 + dst3_idx));
short2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data0.s4, src_data1.s0) : (short2)(src_data0.s0, src_data0.s4);
tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data0.s5, src_data1.s1) : (short2)(src_data0.s1, src_data0.s5);
tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data0.s6, src_data1.s2) : (short2)(src_data0.s2, src_data0.s6);
tmp_data3 = (dst3_offset & 3) == 0 ? (short2)(src_data0.s7, src_data1.s3) : (short2)(src_data0.s3, src_data0.s7);
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
*((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
*((global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
*((global short2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
}
}
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
short2 tmp_data0, tmp_data1, tmp_data2;
short src_data_0 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 3];
short src_data_1 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 2];
short src_data_2 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 1];
short src_data_3 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 0];
short src_data_4 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 1];
short src_data_5 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 2];
short src_data_6 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 3];
short src_data_7 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 4];
short src_data_8 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 5];
tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data_3, src_data_6) : (short2)(src_data_0, src_data_3);
tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data_4, src_data_7) : (short2)(src_data_1, src_data_4);
tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data_5, src_data_8) : (short2)(src_data_2, src_data_5);
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
*((__global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((__global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
*((__global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
}
}
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
if(src_idx_0 < 0)
{
short4 tmp;
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_idx_1< 0)
{
short4 tmp;
tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
}
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 tmp_data0, tmp_data1;
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
*((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
*((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
}
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
((__global int *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
}
}
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int src_data_0 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 0];
int src_data_1 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 1];
int src_data_2 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 2];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
}
}
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
}
}
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
((__global float *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
}
}
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0];
float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1];
float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
}
}
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
}
}
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0];
double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1];
double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
}
}
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
}
}
#endif