I have to program the Floyd algorithm using OpenCL. It works fine, but only with n < 268. When n >= 268, I get an "Access violation reading location" error when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distance matrix.
// Read the matrix dimension n from stdin.
int n;
printf("enter n value: ");
scanf("%d", &n);
printf("\n");
int n2 = n * n; // number of entries in an n x n matrix
// Byte count used below for clCreateBuffer and the buffer read/write calls.
// NOTE(review): this multiplies by sizeof(int*), whereas the two arrays are
// allocated with sizeof(int); the first answer below identifies this mismatch
// as the cause of the access violation.
int matSize = n2 * sizeof(int*);
int* graphe = malloc(sizeof(int) * n2);
int* distances = malloc(sizeof(int) * n2);
// Flattened 2-D indexing: mat[i,j] => mat[i*n + j]
// Only graphe is checked for allocation failure, and execution continues
// regardless; distances is not checked.
if (graphe == NULL)
printf("malloc failed\n");
// init_graphe and copy are helpers not shown here; presumably they fill the
// adjacency matrix and copy it into distances -- confirm against their source.
init_graphe(graphe, n);
copy(graphe, distances, n);
Initialization of OpenCL variables:
// Host-side OpenCL setup: load the kernel source, pick the first platform,
// create one context over all of its devices, a command queue on the first
// device, four device buffers, and the "floyd" kernel object.
// NOTE(review): `status` is overwritten by every call and never inspected,
// so any failure in this sequence goes unnoticed.
char* programSource = load_kernel("kernel.cl");
cl_int status;
// STEP 1: Discover and initialize the platforms
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
// First call only queries how many platforms exist; second call fills the array.
status = clGetPlatformIDs(0, NULL, &numPlatforms);
printf("Number of platforms = %d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
char Name[1000];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
printf("Name of platform : %s\n", Name);
fflush(stdout);
// STEP 2: Discover and initialize the devices
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
// Same count-then-fill pattern as for the platforms, on platform 0 only.
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("Number of devices = %d\n", (int)numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Print the name of every device (the Name buffer is reused).
for (int i = 0; i < numDevices; i++) {
    clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
    printf("Name of device %d: %s\n\n", i, Name);
}
// STEP 3: Create a context
fflush(stdout);
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// STEP 4: Create a command queue
fflush(stdout);
cl_command_queue cmdQueue;
// The queue is bound to devices[0]; the other devices are in the context but unused.
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
// STEP 5: Create device buffers
fflush(stdout);
cl_mem buffer_graphe;
cl_mem buffer_n;
cl_mem buffer_distances;
cl_mem buffer_k;
// The two matrices use matSize bytes; n and k are each passed through a
// one-int buffer.
buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
fflush(stdout);
// STEP 6: Create and compile the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
printf("Compilation\n");
fflush(stdout);
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// STEP 7: Create the kernel
cl_kernel kernel = NULL;
fflush(stdout);
kernel = clCreateKernel(program, "floyd", &status);
// One work-item per matrix entry (i, j).
size_t globalWorkSize[2] = { n, n };
// Declared but not passed to clEnqueueNDRangeKernel in the loop that follows
// (that call passes NULL for the local size).
size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
clock_t start = clock();
int k;
for (k = 0; k < n; k ) {
status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
clFinish(cmdQueue);
status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
clFinish(cmdQueue);
}
and the kernel:
/*
 * One Floyd-Warshall relaxation step for a fixed intermediate vertex *k.
 *
 * Work-item (i, j) = (get_global_id(0), get_global_id(1)) replaces
 * distances[i][j] with distances[i][k] + distances[k][j] when that sum is
 * smaller. The matrix is stored flat, entry (r, c) at index r * (*n) + c.
 *
 * graphe    - bound by the host but not read or written in this kernel
 * n         - pointer to the matrix dimension
 * distances - n x n distance matrix, updated in place
 * k         - pointer to the intermediate vertex index for this launch
 */
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    int i = get_global_id(0);
    int j = get_global_id(1);
    int ij = i * (*n) + j;      /* flat index of (i, j) */
    int ik = i * (*n) + (*k);   /* flat index of (i, k) */
    int kj = (*k) * (*n) + j;   /* flat index of (k, j) */
    if (distances[ik] + distances[kj] < distances[ij]) {
        distances[ij] = distances[ik] + distances[kj];
    }
}
CodePudding user response:
You have:
int matSize = n2 * sizeof(int*);
…
int* distances = malloc(sizeof(int) * n2);
…
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
- Say n2 is 100. matSize will be 800 on a 64-bit system. (sizeof(int*) = 8)
- You allocate 400 bytes of memory for your distances array. (sizeof(int) = 4, typically)
- You then copy 800 bytes (matSize) from distances into your OpenCL buffer. This overflows the end of the array. Whoops.
The bug is of course the use of sizeof(int*): you've got an array of ints, not an array of pointers, so this should be sizeof(int), which is what you're correctly doing in the malloc call. (I can't quite fathom why you're not using matSize there.) Although what you should probably be using is cl_int, or one of the explicitly-sized types (int32_t in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.
Additional Notes:
- I'm not 100% convinced your data dependencies are safe here. No work-item should be reading an array entry that another is writing in the same kernel enqueueing batch. It seems to me that ij (written) for one of the work-items will be equal to ik (read) for the others in the row? Similar deal with ij and kj.
- There's no need to read and re-write the distances buffer between iterations, if you're not modifying it on the host. Neither does graphe need re-writing every time if it's not changing.
- You can pass scalar arguments such as k and n without a buffer. status = clSetKernelArg(kernel, 1, sizeof(n), &n); works fine if you change your kernel signature's argument to int n. (No dereference needed in the kernel then.)
- A local work size of 20x20 is likely not optimal. If you're not using local memory or barriers, don't bother setting a local size at all.
- You should be able to remove the clFinish calls, and you can change the buffer writes to be non-blocking once you've moved them outside the loop. This might give you an additional small speedup.
CodePudding user response:
I think I found the solution: I replaced malloc with calloc, and now it works.