How do you correctly pass arguments to a kernel using the Cuda driver api cuLaunchKernel?-CodePudding

I'm creating some device buffers that I'm attempting to pass to a simple kernel using the cuda driver API. I'm creating three device buffers and storing them in std::vector.

std::vector<void *> kernel_arguments;

std::vector<float> a = {2};
std::vector<float> b = {3};

for (auto &input : {a, b}) {
    CUdeviceptr ptr;
    cuMemAlloc(&ptr, input.size()*sizeof(float));
    cuMemcpyHtoD(ptr, input.data(), input.size()*sizeof(float));
    kernel_arguments.push_back(reinterpret_cast<void *> (&ptr));
}

std::vector<float> c(1);

for (auto &output : {c}) {
    CUdeviceptr ptr;
    cuMemAlloc(&ptr, output.size()*sizeof(float));
    kernel_arguments.push_back(reinterpret_cast<void *> (&ptr));
}

CUresult result = cuLaunchKernel(function, 1, 1, 1,
                                 1024, 1, 1, 0, stream,
                                 kernel_arguments.data(), NULL)
const char *error;
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::end;
result = cuStreamSynchronize(stream);
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::end;

The kernel function is a simple addition kernel with three arguments.

__global__ void add_kernel(
    float *i_1,
    float *i_2,
    float *o_3) {
    const size_t index = blockIdx.x*blockDim.x   threadIdx.x;
    if (index < 1) {
        printf("index %d\n", index);
        printf("%p\n", i_1);
        printf("%f\n", *i_1);
        const float r_1 = i_1[index];
        printf("%p\n", i_2);
        printf("%f\n", *i_2);
        const float r_2 = i_2[index];
        const float r_3 = r_1   r_2;
        o_3[index] = r_3;
    }
}

Running this I get the output.

0 no error
index 0
0x14cf4c400200
3.000000
0x14cf4c400200
3.000000
700 an illegal memory access was encountered

Why am I getting the same pointer value for the first and second arguments and why is does it appear that my second device buffer is ending up in the first argument?

CodePudding user response：

This methodology works when you are pushing back a value that is located on the stack, but not when you are pushing back the address of a stack location - that doesn't change from one iteration of the for-loop to the next:

for (auto &input : {a, b}) {
    CUdeviceptr ptr;  // a stack variable
    cuMemAlloc(&ptr, input.size()*sizeof(float));
    cuMemcpyHtoD(ptr, input.data(), input.size()*sizeof(float));
    kernel_arguments.push_back(reinterpret_cast<void *> (&ptr));  //**
}
// ptr is out of scope here

That explains why the first and second parameters both appear to be referencing your 2nd kernel input argument (i_2, 3).

Otherwise when I build a complete code around what you have shown, I don't get any error 700 (however pushing/usage of the address of a stack variable is also going to lead to UB/illegal access once that variable goes out of scope.)

Here is an example (modified from the vectorAddDrv sample code) with a trivial modification to your allocation loop (that is overwriting the stack value at each iteration), to fix that issue:

$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>
#include <vector>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
    {
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size())
    {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
    // your code, modified
    std::vector<void *> kernel_arguments;

    std::vector<float> a = {2};
    std::vector<float> b = {3};

    for (auto &input : {a, b}) {
        CUdeviceptr *ptr = new CUdeviceptr;
        cuMemAlloc(ptr, input.size()*sizeof(float));
        cuMemcpyHtoD(*ptr, input.data(), input.size()*sizeof(float));
        kernel_arguments.push_back(ptr);
    }

    std::vector<float> c(1);

    for (auto &output : {c}) {
        CUdeviceptr *ptr = new CUdeviceptr;
        cuMemAlloc(ptr, output.size()*sizeof(float));
        kernel_arguments.push_back(ptr);
    }

    CUresult result = cuLaunchKernel(vecAdd_kernel, 1, 1, 1,
                                 1024, 1, 1, 0, NULL,
                                 kernel_arguments.data(), NULL);
    const char *error;
    cuGetErrorString(result, &error);
    std::cout << result << " " << error << std::endl;
    checkCudaErrors(cuCtxSynchronize());
    cuGetErrorString(result, &error);
    std::cout << result << " " << error << std::endl;
    for (auto &c : kernel_arguments) cuMemFree(*(reinterpret_cast<CUdeviceptr *>(c)));  // this works since all of the kernel arguments in this case happen to be CUdeviceptr

    exit(EXIT_SUCCESS);
}
$ nvcc -I/usr/local/cuda/samples/common/inc  -o test vectorAddDrv.cpp  -lcuda
$ compute-sanitizer ./test
========= COMPUTE-SANITIZER
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
0 no error
index 0
0x7f8023c00000
2.000000
0x7f8023c00200
3.000000
0 no error
========= ERROR SUMMARY: 0 errors
$