Home > Enterprise >  OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer
OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer

Time:04-29

I am trying to make an Algorithm run on OpenCL. I am using this repository (Source.cpp) as a template. I now want to convert the whole program into type of long algorithm instead of float. But I always get an CL_INVALID_VALUE (-30) exception at the second clEnqueueWriteBuffer. I have wasted hours without finding the error, so maybe I have overseen something obvious (I have not done too much with opencl yet..) ?

My code (not working)

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>

//#define DATA_SIZE 1024
#define DATA_SIZE 1024

using namespace std;


//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid   1)-1; \n"\
"int bi = offset*(2*thid   2)-1; \n"\
"temp[bi]  = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/




int main(void)
{
    cl_context context;
    cl_context_properties properties[3];

    cl_command_queue command_queue;
    cl_kernel kernel;
    cl_program program;
    cl_int err;
    cl_uint num_platforms = 0;
    cl_platform_id* platforms;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    size_t global, loc;
    std::cout << "Setup \n";

    long arr[DATA_SIZE];
    long inputDataA[DATA_SIZE];
    long results[2 * DATA_SIZE];
    long  i;
    for (i = 1; i < DATA_SIZE - 1;i  )
    {
        inputDataA[i-1] = (long)i;
        arr[i-1] = (long)i;
    }
    clock_t ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput
    cl_int result;
    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);
    clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS); // ERROR HERE 
    clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    int temp = DATA_SIZE;

    clock_t start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);


    global = DATA_SIZE; // num of processors
    loc = 256;

    printf("\n>> start parallel ---------- \n");
    // enqueue the kernel command for execution
    clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);

    // copy the results from out of the output buffer
    clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    clFinish(command_queue);
    ends = clock();

    // print the results
    int k = 1;
    for (k = 1;k < 8; k  )
    {
        printf("%d - ", k);
        printf("%d \n", results[k]);
    }
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    printf("\n>>finished parallel in %lf seconds\n", time_taken);

    // cleanup - release OpenCL resources

    printf("\n-------------------------------------\n");


    /* -------sequential ------- */
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    const clock_t startSequential = clock();

    prefixSum[0] = arr[0];
    long idx = 1;

    for (idx = 1; idx < DATA_SIZE; idx  ) {
        prefixSum[idx] = prefixSum[idx - 1]   arr[idx];
    }
    const clock_t endSequential = clock();




    double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;

    printf("\n>> finished sequential in %lf\n", seqTime);
    for (int j = 0;j < 8; j  )
    {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }


    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    return 0;
}

The repository code (working):

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>

#include <CL/cl.h>

#define DATA_SIZE 1024
using namespace std;
ofstream outfile;


const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid   1)-1; \n"\
"int bi = offset*(2*thid   2)-1; \n"\
"temp[bi]  = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/



int main(void)
{
    cl_uint num_platforms = 0;

    cl_context context;
    cl_context_properties properties[3];
    cl_kernel kernel;
    cl_platform_id* platforms;
    cl_command_queue command_queue;
    cl_program program;
    cl_int err;
    cl_uint num_of_platforms = 0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    outfile.open("shubham.txt");
    size_t global, loc;

    float inputDataA[DATA_SIZE];
    float results[2 * DATA_SIZE] = { 0 };

    int i;
    for (i = 0; i < DATA_SIZE;i  )
    {
        inputDataA[i] = (float)i;
    }
    clock_t start, ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device

    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput

    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);

    int temp = DATA_SIZE;

    start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);
    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);

    global = DATA_SIZE;
    loc = 256;
    // enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    clFinish(command_queue);

    // copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
    //clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);

    // print the results
    printf("output: ");

    for (i = 0;i < 5; i  )
    {
        printf("%f \n", results[i]);
        outfile << results[i] << " ";
    }
    ends = clock();
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    outfile << endl << "Time taken is : " << time_taken << endl;

    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    return 0;
}

Thanks in Advance

CodePudding user response:

I found your mistake. For me, it first wouldn't compile the OpenCL C code, so I debugged with

char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);

to get the build log:

<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
       ^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
       ^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");

Seems ' instead of \" was the issue. Change this line:

"printf(\"%d\",thid); \n"

Then the OpenCL C code compiles and I can reproduce the CL_INVALID_VALUE error.

Here is the issue: You use clEnqueueWriteBuffer to copy data from inputB to 0. You need to add C arrays to copy the data into:

long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];

and

clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

Then it works, and I get this output:

Setup
0

>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12

>>finished parallel in 0.023000 seconds

-------------------------------------

>> start sequential ----------

>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36

Also note, that in OpenCL C, long = 64-bit integer, but in C , long = "at least" 32-bit integer, for whatever stupid reason. In C you shoud use long long int as this is alwqays 64-bit integer. You can use for example typedef int64_t slong;, where int64_t itself is a typedef of long long int.

Another issue is that the program is not deterministic. When executed multiple times, I get a different result each time. There must be some race condition present. I suppose you wrongly assume that barrier(CLK_GLOBAL_MEM_FENCE); provides global synchronization of all threads, but this is not true. The only global synchronization is to split the kernel into multiple kernels at the desired synchronizatipon points and execute them one after the other.


Finally, to make OpenCL development in C much easier, and to prevent wasting hours on such simple errors, I have created a lightweight OpenCL-Wrapper to eliminate all of the OpenCL code overhead. With this, your code is 4x shorter and much easier to understand:

#include "opencl.hpp"

#define DATA_SIZE 1024

int main() {
    Clock clock;
    clock.start();

    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device

    Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
    Memory<slong> inputA(device, DATA_SIZE);
    Memory<slong> inputB(device, DATA_SIZE);
    Memory<slong> output(device, DATA_SIZE);

    for(int i=1; i<DATA_SIZE-1; i  ) {
        inputA[i-1] = (slong)i;
        arr[i-1] = (slong)i;
    }
    inputA.write_to_device();
    
    Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
    kernel.add_constants(DATA_SIZE);

    kernel.run();
    output.read_from_device();

    double time_taken = clock.stop();

    // print the results
    for(int k=1; k<8; k  ) {
        printf("%d - ", k);
        printf("%d \n", output[k]);
    }
    printf("\n>>finished parallel in %lf seconds\n", time_taken);
    printf("\n-------------------------------------\n");
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    clock.start();
    prefixSum[0] = arr[0];
    for(long idx=1; idx<DATA_SIZE; idx  ) {
        prefixSum[idx] = prefixSum[idx-1] arr[idx];
    }
    double seqTime = clock.stop();

    printf("\n>> finished sequential in %lf\n", seqTime);
    for(int j=0; j<8; j  ) {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }

    wait();
    return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with ) R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################

kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
    int thid = get_global_id(0);
    int offset = 1;
    printf("%d",thid);
    temp[2*thid] = input[2*thid];
    temp[2*thid 1] = input[2*thid 1];
    for(int d= size>>1; d>0; d >>= 1) {
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid   1)-1;
            int bi = offset*(2*thid   2)-1;
            temp[bi]  = temp[ai];
        }
        offset = offset*2;
    }
    temp[size-1] = 0;
    barrier(CLK_GLOBAL_MEM_FENCE);
    for(int d = 1; d<size; d *= 2) {
        offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1;
            long t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t;
        }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    output[2*thid] = temp[2*thid];
    output[2*thid 1] = temp[2*thid 1];
}

);} // ############################################################### end of OpenCL C code #####################################################################
  • Related