Home > Net >  Cuda complex object initialization within device: problem with cudaDeviceSetLimit
Cuda complex object initialization within device: problem with cudaDeviceSetLimit

Time:11-27

I am trying to initialize complex objects within my device, within threads and within blocks. It seems to me I have a problem with cudaDeviceSetLimit. Given my understanding of the problem, I am not correctly setting the amount of heap memory per thread. This part of the documentation refers to my problem, but it does not show how to initialize an object. I have also read this post but wasn't able to get my code working.

Edit

Contrary to the first answer: doing this inside the kernel is a must in my problem configuration, because I want to take advantage of initializing the objects across blocks in parallel.

I have made the following toy example which works for a low number of blocks (65) but not for 65535 blocks (the maximum amount of blocks I could use on my device):

// Minimal node type. In this example instances are created one at a time on
// the device heap (in-kernel `new` inside mallocTest).
class NNode{

    public:

        int node_id;  // node identifier (never assigned in this toy example)
};

// A network owning an array of node pointers. Both the pointer array and the
// NNode objects it points to are allocated on the device heap inside the kernel.
class cuNetwork{

    public:

        int num_allnodes;   // number of entries in all_nodes (never assigned in this toy example)
        NNode** all_nodes;  // device-heap array of num_allnodes NNode* pointers

};

// One thread per block builds a cuNetwork (plus num_allnodes NNode objects) on
// the device heap and publishes it in arr_gpu_net[blockIdx.x].
// Precondition: launched with <<<numBlocks, 1>>> and cudaLimitMallocHeapSize
// large enough for all per-block allocations; in-kernel `new` returns nullptr
// (dereferenced here without a check) when the heap is exhausted.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    cuNetwork* gpu_net  = new cuNetwork(); 
    gpu_net->all_nodes = new NNode*[num_allnodes];

    // Fixed: loop increment was garbled ("i  )") — restore i++.
    for(int i=0; i<num_allnodes; i++){

            gpu_net->all_nodes[i] = new NNode();
    }

    arr_gpu_net[bId] = gpu_net;

}

int main(int argc, const char **argv){

    int numBlocks = 65; 
    int num_allnodes = 200; 

    // Device array holding one cuNetwork* per block.
    // Fixed: the original first did `new cuNetwork*[numBlocks]` on the host and
    // then overwrote the pointer with cudaMalloc, leaking the host allocation.
    cuNetwork** arr_gpu_net = nullptr;
    cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);

    // Per-block device-heap demand (fixed: "+=" operators were garbled to "=").
    size_t size; 
    size  = sizeof(cuNetwork);                 // new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;     // new NNode*[num_allnodes]
    size += sizeof(NNode) * num_allnodes;      // for()... new NNode()

    // NOTE(review): the device allocator adds per-allocation padding/bookkeeping,
    // so numBlocks * size is a lower bound on the heap actually needed.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, numBlocks * size);
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);

    // Synchronize so in-kernel faults (e.g. heap exhaustion) surface here.
    cudaError_t status = cudaDeviceSynchronize();
    if(status != cudaSuccess){
        return 1;
    }

    return 0;

}

As soon as I start adding additional properties to the objects, or if I increase numBlocks to 65535, I get the error:

CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x555555efff90

Thread 1 "no_fun" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (7750,0,0), thread (0,0,0), device 0, sm 1, warp 3, lane 0]
0x0000555555f000b0 in mallocTest(int, cuNetwork**)<<<(65535,1,1),(1,1,1)>>> ()

My question is: in this example, how should I properly set cudaDeviceSetLimit so that there is enough memory for each thread to initialize its cuNetwork?

CodePudding user response:

To answer your question:

Due to memory padding and allocation granularity, each block probably requires more memory than the calculated size. You should always check the return value of new. If it is nullptr, the allocation failed.


However, if the total number of nodes for all networks is known up front, it would be more efficient to just cudaMalloc the memory for all nodes, (and all networks). Then, in the kernel just update the pointers accordingly.

Something like this:

// Flat layout: one big preallocated node array; each network stores a pointer
// into its own contiguous slice of it. (Fixed: missing ';' after the struct,
// Node*/NNode* typo, numNodesperNetwork capitalization, and the garbled '+'.)
struct cuNetwork2{
    int num_allnodes;
    NNode* all_nodes;
};

__global__ void kernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
   int index = ...  // e.g. blockIdx.x for a one-network-per-block launch
   d_networks[index].num_allnodes = numNodesPerNetwork;
   d_networks[index].all_nodes = d_nodes + index * numNodesPerNetwork;
}

...

int numBlocks = 65; 
int num_allnodes = 200;

cuNetwork2* d_networks;
NNode* d_nodes;
// Two host-side allocations cover everything: all network structs and all nodes.
cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);

// One block per network; the kernel only wires up the pointers.
kernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);

In this case, you don't need cudaDeviceSetLimit or in-kernel dynamic allocation.

Here is a simple performance comparison between both approaches.

#include <iostream>
#include <chrono>
#include <cassert>

// Minimal node type used by both allocation strategies below.
class NNode{

public:

    int node_id;  // node identifier (never assigned in this benchmark)
};

// Device-heap variant: all_nodes is an array of pointers, each element
// individually allocated with in-kernel `new`.
class cuNetwork{

public:

    int num_allnodes;   // number of entries in all_nodes (never assigned in this benchmark)
    NNode** all_nodes;  // device-heap array of NNode* pointers

};

// Device-heap benchmark kernel: one thread per block allocates a cuNetwork and
// num_allnodes NNode objects with in-kernel `new`, serially.
// Precondition: launched with <<<numBlocks, 1>>>; cudaLimitMallocHeapSize must
// be large enough or the unchecked `new` results fault.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    cuNetwork* gpu_net  = new cuNetwork(); 
    gpu_net->all_nodes = new NNode*[num_allnodes];

    // Fixed: loop increment was garbled ("i  )") — restore i++.
    for(int i=0; i<num_allnodes; i++){

            gpu_net->all_nodes[i] = new NNode();
    }

    arr_gpu_net[bId] = gpu_net;

}


// Multi-threaded variant: thread 0 of each block allocates the cuNetwork and
// the pointer array, then (after a barrier) all threads of the block allocate
// the NNode objects cooperatively in a block-stride loop.
// The __syncthreads() is reached by every thread of the block (the divergent
// work is before it), and it orders thread 0's write of arr_gpu_net[bId]
// before the other threads read it.
__global__ void mallocTestMultiThreadBlock(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    if(threadIdx.x == 0){
        cuNetwork* gpu_net  = new cuNetwork(); 
        assert(gpu_net != nullptr);  // device heap exhausted -> trap here, not later

        gpu_net->all_nodes = new NNode*[num_allnodes];
        assert(gpu_net->all_nodes != nullptr);

        arr_gpu_net[bId] = gpu_net;
    }
    __syncthreads();

    // Fixed: stride increment was garbled ("i  =") — restore i += blockDim.x.
    for(int i=threadIdx.x; i<num_allnodes; i += blockDim.x){
        arr_gpu_net[bId]->all_nodes[i] = new NNode();
    }


}


// Flattened alternative to cuNetwork: instead of per-node device-heap
// allocations, all_nodes points into one big cudaMalloc'ed NNode array.
struct cuNetwork2{
    int num_allnodes;  // number of nodes in this network's slice
    NNode* all_nodes;  // pointer into the shared d_nodes array
};

// Wires each network to its contiguous slice of the preallocated node array.
// Precondition: launched with <<<numBlocks, 1>>>, one block per network;
// d_nodes holds at least gridDim.x * numNodesPerNetwork elements.
__global__ 
void pointerassignmentkernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    int bId = blockIdx.x; 
    d_networks[bId].num_allnodes = numNodesPerNetwork;
    // Fixed: '+' was garbled out of the pointer arithmetic.
    d_networks[bId].all_nodes = d_nodes + bId * numNodesPerNetwork;
}

// Placement-constructs one NNode per thread over the flat device array.
// Launch with at least totalNumNodes threads; excess threads exit early.
__global__ 
void nodeinitkernel(NNode* d_nodes, int totalNumNodes){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid >= totalNumNodes){
        return;  // grid tail guard
    }
    new (d_nodes + tid) NNode();  // construct in place, no heap allocation
}

// Benchmark driver: times the in-kernel `new` approach against the flat
// cudaMalloc + pointer-assignment + placement-new approach.
int main(int argc, const char **argv){

    int numBlocks = 64; 
    int num_allnodes = 200; 

    // Device array of per-block network pointers.
    // Fixed: the original leaked a host-side `new cuNetwork*[numBlocks]` by
    // immediately overwriting the pointer with cudaMalloc.
    cuNetwork** arr_gpu_net = nullptr;
    cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);

    // Per-block device-heap demand (fixed: "+=" operators were garbled to "=").
    // Computed for illustration only; the limit below is set to a flat 1 GB.
    size_t size; 
    size  = sizeof(cuNetwork);                 // new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;     // new NNode*[num_allnodes]
    size += sizeof(NNode) * num_allnodes;      // for()... new NNode()

    //size = sizeof(cuNetwork) + (sizeof(int) * 2 + sizeof(NNode)) * num_allnodes;
    
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1ull * 1024ull * 1024ull * 1024ull); // I set this to 1GB which did not cause errors for the given problem size

    std::chrono::time_point<std::chrono::system_clock> timeA = std::chrono::system_clock::now();
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    //mallocTestMultiThreadBlock<<<numBlocks, num_allnodes>>>(num_allnodes, arr_gpu_net);

    cudaError_t status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeB = std::chrono::system_clock::now();

    std::cerr << "mallocTest kernel: " << std::chrono::duration<double>(timeB - timeA).count() << "s\n";

    // --- Flat-allocation approach ---

    std::chrono::time_point<std::chrono::system_clock> timeC = std::chrono::system_clock::now();
    cuNetwork2* d_networks;
    NNode* d_nodes;
    cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
    cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
    std::chrono::time_point<std::chrono::system_clock> timeD = std::chrono::system_clock::now();
    
    std::cerr << "host cudaMalloc: " << std::chrono::duration<double>(timeD - timeC).count() << "s\n";
    
    pointerassignmentkernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
    
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeE = std::chrono::system_clock::now();
    
    std::cerr << "pointerassignmentkernel: " << std::chrono::duration<double>(timeE - timeD).count() << "s\n";   

    // Ceil-div launch: enough 128-thread blocks to cover every node.
    nodeinitkernel<<<(numBlocks * num_allnodes + 128 - 1) / 128, 128>>>(d_nodes, numBlocks * num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeF = std::chrono::system_clock::now();
    
    std::cerr << "nodeinitkernel: " << std::chrono::duration<double>(timeF - timeE).count() << "s\n"; 
    
    cudaDeviceReset();

    return 0;

}
Compiled with: nvcc -arch=sm_61 -O3 kernelallocation.cu -o kernelallocation
mallocTest kernel: 0.0183772s
host cudaMalloc: 5.02e-06s
pointerassignmentkernel: 1.2739e-05s
nodeinitkernel: 1.213e-05s
  • Related