I have the following buffer:
// Size in bytes of a 256 x 256 image with 4 bytes per pixel.
// constexpr so it is a valid (standard C++) array bound.
constexpr int nbytes = 256 * 256 * 4;
// Host-side pixel buffer; value-initialised so every byte starts at zero.
uint8_t buffer_window[nbytes] = {};
I allocate it on the host as shown above. I then create the OpenCL buffer from it like this:
// Create a read/write OpenCL buffer of nbytes bytes. CL_MEM_COPY_HOST_PTR asks
// OpenCL to initialise it by copying from buffer_window at creation time.
// NOTE(review): CL_CHECK2 is a project macro; presumably it checks _err - confirm.
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nbytes, (void*)&buffer_window[0], &_err));
and I enqueue a 2D kernel over the 2D framebuffer as usual:
// Two-dimensional global range of 256 x 256 work-items.
global_size[0] = 256;
global_size[1] = 256;
// Timestamp taken immediately before the kernel is enqueued.
auto time_start = std::chrono::high_resolution_clock::now();
// work_dim = 2, no global offset (NULL), and a NULL local size, which leaves
// the choice of work-group size to the OpenCL implementation.
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
// Block until every command previously queued on commandQueue has completed.
CL_CHECK(clFinish(commandQueue));
In the kernel, whenever I write data to that buffer, it executes very slowly, but when I remove all writes to that buffer, the kernel executes very fast.
Follow-up: the kernel
/*
 * sendImageToPBO - rasterises one triangle per work-item into dst_buffer.
 *
 * Each work-item derives a linear index from its 2D position (row stride
 * 256). When that index is below triCount, every pixel whose centre lies
 * inside triangles[index] is set to opaque green (0, 255, 0, 255).
 *
 * dst_buffer : output pixels, addressed as y * get_global_size(0) + x.
 * triangles  : input triangles; only v[0..2].pos[0..1] are read here.
 * triCount   : number of valid entries in triangles.
 *
 * Relies on edgeFunction(), min3() and max3(), which are defined outside
 * this kernel. Several work-items may touch the same pixel; all of them
 * store the same value.
 */
__kernel void sendImageToPBO(__global uchar4* dst_buffer, __global struct Triangle_* triangles, int triCount)
{
    const int imageWidth = 256;
    const int imageHeight = 256;

    /* 2D position of this work-item, built from its group and local ids. */
    const int xt = (int)(get_group_id(0) * get_local_size(0) + get_local_id(0));
    const int yt = (int)(get_group_id(1) * get_local_size(1) + get_local_id(1));

    /* One triangle per work-item; surplus work-items have nothing to do. */
    const int index = xt + (yt * imageWidth);
    if (index >= triCount)
        return;

    const float3 v0Raster = (float3)(triangles[index].v[0].pos[0], triangles[index].v[0].pos[1], 0.0f);
    const float3 v1Raster = (float3)(triangles[index].v[1].pos[0], triangles[index].v[1].pos[1], 0.0f);
    const float3 v2Raster = (float3)(triangles[index].v[2].pos[0], triangles[index].v[2].pos[1], 0.0f);

    /* Axis-aligned bounding box of the triangle. */
    const float xmin = min3(v0Raster.x, v1Raster.x, v2Raster.x);
    const float ymin = min3(v0Raster.y, v1Raster.y, v2Raster.y);
    const float xmax = max3(v0Raster.x, v1Raster.x, v2Raster.x);
    const float ymax = max3(v0Raster.y, v1Raster.y, v2Raster.y);

    /*
     * Skip triangles whose bounding box misses the image entirely. All four
     * conditions must hold (&&); written positively so that NaN coordinates
     * are rejected as well.
     */
    const bool onScreen = xmin <= (float)(imageWidth - 1) && xmax >= 0.0f &&
                          ymin <= (float)(imageHeight - 1) && ymax >= 0.0f;
    if (!onScreen)
        return;

    /*
     * Clamp in floating point before converting, so the conversion to int is
     * always in range. The bounds are signed: an unsigned upper bound that
     * wrapped around would turn the loops below into billions of iterations.
     */
    const int x0 = (int)floor(fmax(xmin, 0.0f));
    const int x1 = (int)floor(fmin(xmax, (float)(imageWidth - 1)));
    const int y0 = (int)floor(fmax(ymin, 0.0f));
    const int y1 = (int)floor(fmin(ymax, (float)(imageHeight - 1)));

    const size_t rowStride = get_global_size(0);

    for (int y = y0; y <= y1; ++y) {
        for (int x = x0; x <= x1; ++x) {
            /* Sample at the pixel centre. */
            const float3 p = (float3)(x + 0.5f, y + 0.5f, 0.0f);
            const float w0 = edgeFunction(v1Raster, v2Raster, p);
            const float w1 = edgeFunction(v2Raster, v0Raster, p);
            const float w2 = edgeFunction(v0Raster, v1Raster, p);

            /* Covered when the sample is on the non-negative side of all three edges. */
            if (w0 >= 0 && w1 >= 0 && w2 >= 0) {
                dst_buffer[(size_t)y * rowStride + (size_t)x] = (uchar4)(0, 255, 0, 255);
            }
        }
    }
}
CodePudding user response:
If the kernel does not write any data/results to a buffer in global memory, the compiler throws out all the code and you essentially get an empty kernel with zero execution time.
The global range / buffer size in your case is very small. Typically, you want at least several million threads to get full saturation and good performance. Otherwise, initial kernel compile time, PCIe data transfer, and kernel call latency might dominate, rather than kernel execution time.