I'm new to C and especially to CUDA. I'm trying to parallelize my code on the GPU using CUDA, and for my purposes I need to be able to create objects of a class, or at least to call their member functions, from within a kernel.
First here is the definition of my class:
//Header
#pragma once
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<stdio.h>
#include<iostream>
// Small example class holding an index, two fixed-size vectors of 100 doubles
// and a scalar property `prop`.
//
// The constructor and destructor are callable from host and device code;
// calculate_stuff() and get_prop() are callable from device code only.
class Ding
{
private:
    int index;             // value passed to the constructor
    double vector_1[100];  // filled by the constructor
    double vector_2[100];  // filled by the constructor
    double prop;           // scalar read by get_prop() and written by calculate_stuff()

public:
    // Builds the object from the integer `ind`.
    __host__ __device__ Ding(int ind);

    // Destructor.
    __host__ __device__ ~Ding();

    // Recomputes `prop` from the two vectors, the factor `coeff` and the
    // iteration count `N`.
    __device__ void calculate_stuff(double coeff, int N);

    // Returns the current value of `prop`.
    __device__ double get_prop();
};
//Source
#include "Ding.h"
#include <math.h>
// Constructs a Ding from the integer `ind`.
//
// Stores `ind` in `index`, sets `prop` to 1 and fills both 100-element vectors
// with values derived from `ind` and the element position.
//
// NOTE(review): the denominators are `ind + ii + 1` and `2 * ind + ii + 1`;
// they are non-zero for every ii in [0, 100) as long as ind >= 0.
__host__ __device__ Ding::Ding(int ind) {
    index = ind;
    prop = 1;
    for (int ii = 0; ii < 100; ii++) {
        // All operands are int here, so the quotient is truncated before it is
        // converted to double.
        // NOTE(review): confirm that integer division is intended.
        vector_1[ii] = (4 * ii + ind) / (ind + ii + 1);
        // The -2.14 literal promotes the numerator to double, so this is a
        // floating-point division.
        vector_2[ii] = (-2.14 * ii + ind) / (2 * ind + ii + 1);
    }
}
// Destructor with an empty body; callable from host and device code.
__host__ __device__ Ding::~Ding() {
}
// Recomputes `prop` (device only).
//
// `prop` is reset to 1 and then, for every ii in [0, N) and every jj in
// [0, 100), the term (-1)^ii * vector_1[jj] * vector_2[jj] * coeff is added.
//
// coeff: factor applied to every term.
// N:     number of outer iterations; for N <= 0 `prop` is left at 1.
__device__ void Ding::calculate_stuff(double coeff, int N) {
    prop = 1;
    for (int ii = 0; ii < N; ii++) {
        // (-1)^ii is exactly +1 for even ii and -1 for odd ii; computing it
        // from the parity once per outer iteration avoids a pow() call per term.
        const double sign = (ii % 2 == 0) ? 1.0 : -1.0;
        for (int jj = 0; jj < 100; jj++) {
            prop += sign * vector_1[jj] * vector_2[jj] * coeff;
        }
    }
}
// Returns the current value of the `prop` member (device only).
__device__ double Ding::get_prop() {
    const double current = prop;
    return current;
}
As you can see, there's not much to it: it only carries out a number of meaningless calculations, since it is just meant to serve as an example for me of how to get such code to run with CUDA.
Now here is the main source file:
#include<cuda.h>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include<stdio.h>
#include<iostream>
#include "Ding.h"
using namespace std;
// Adds each object's `prop` value and the element's own index to vec_a.
//
// Expects a 1D launch; each thread handles the single element whose index is
// blockIdx.x * blockDim.x + threadIdx.x, so the grid must supply at least
// `size` threads for every element to be processed.
//
// vec_a: array of at least `size` doubles, updated in place.
// teil:  array of at least `size` Ding objects.
// size:  number of elements to process.
__global__ void some_kernel(double *vec_a, Ding* teil, int size) {
    const int ii = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the data (tail of the last block) do nothing.
    if (ii >= size) {
        return;
    }
    vec_a[ii] += teil[ii].get_prop();
    vec_a[ii] += ii;
}
// Prints a message on stderr when `status` is not cudaSuccess.
// Returns true when the call succeeded, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Builds N Ding objects on the host, copies them and a zeroed vector to the
// device, runs some_kernel over them and prints the sum of the vector before
// and after the results are copied back.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main() {
    const int N = 300;
    const int threads_per_block = 256;
    // Ceiling division: a partially filled last block is required whenever N
    // is not a multiple of the block size (N = 300 needs 2 blocks of 256).
    const int blocks = (N + threads_per_block - 1) / threads_per_block;

    double* vec_1 = (double*)malloc(N * sizeof(double));
    Ding* teil = (Ding*)malloc(N * sizeof(Ding));
    double* d_vec_1 = NULL;
    Ding* d_teil = NULL;
    if (vec_1 == NULL || teil == NULL) {
        std::cerr << "Host allocation failed" << std::endl;
        free(vec_1);
        free(teil);
        return 1;
    }

    for (int ii = 0; ii < N; ii++) {
        vec_1[ii] = 0;
        teil[ii] = Ding(ii);
    }

    bool ok = cuda_ok(cudaMalloc(&d_vec_1, N * sizeof(double)), "cudaMalloc(d_vec_1)")
           && cuda_ok(cudaMalloc(&d_teil, N * sizeof(Ding)), "cudaMalloc(d_teil)")
           && cuda_ok(cudaMemcpy(d_vec_1, vec_1, N * sizeof(double), cudaMemcpyHostToDevice),
                      "cudaMemcpy(vec_1 -> d_vec_1)")
           && cuda_ok(cudaMemcpy(d_teil, teil, N * sizeof(Ding), cudaMemcpyHostToDevice),
                      "cudaMemcpy(teil -> d_teil)");

    if (ok) {
        some_kernel<<<blocks, threads_per_block>>>(d_vec_1, d_teil, N);
        // A kernel launch returns no status itself; launch errors are
        // reported through cudaGetLastError().
        ok = cuda_ok(cudaGetLastError(), "some_kernel launch");
    }

    // The host copy has not been overwritten yet, so this is the sum of the
    // initial (all-zero) vector.
    double result = 0;
    for (int ii = 0; ii < N; ii++) {
        result += vec_1[ii];
    }
    cout << "Old result: " << result << endl;

    if (ok) {
        // The blocking copy also reports errors raised while the kernel ran.
        ok = cuda_ok(cudaMemcpy(vec_1, d_vec_1, N * sizeof(double), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(d_vec_1 -> vec_1)");
    }
    if (ok) {
        result = 0;
        for (int ii = 0; ii < N; ii++) {
            result += vec_1[ii];
        }
        cout << "New result: " << result << endl;
    }

    // Release device and host memory on every path.
    cudaFree(d_vec_1);
    cudaFree(d_teil);
    free(vec_1);
    free(teil);
    return ok ? 0 : 1;
}
I created an array of objects from the host and copied it to the device. On the device only the getter for the parameter "prop" is called and the value is added to the vector vec_a. So basically the code works when no class member is called from the device. So if I comment out the line:
vec_a[ii] = teil[ii].get_prop();
the code works but as soon as any class members come into play I get the following error that unfortunately I couldn't make any sense of:
Fehler MSB3721 Der Befehl ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\nvcc.exe" -gencode=arch=compute_52,code=sm_52 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 -cuda -cudart static -g -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc142.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\File.cu.obj "C:\Users\ronal\source\repos\Basic_Cuda_Test\Basic_Cuda_Test\File.cu"" wurde mit Code 255 beendet. Basic_Cuda_Test C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.4.targets 785
As can be seen from this message, I'm using CUDA v11.4 and Microsoft Visual Studio 2019. Without accessing class members the calculation is carried out successfully, so the CUDA setup seems to work in general. That's why I assume that everything is installed and configured properly so far. Ideally I would also be able to create objects on the device, but for now I would be very happy if I could make use of class members on the device somehow. Probably the problem is pretty basic.
I'm looking forward to your answers and solutions.
CodePudding user response:
Judging from the code given, I assume you have some file Ding.cu which contains the implementation of Ding.
To be able to call a device function from a different compilation unit (the kernel is implemented in main.cu), relocatable device code has to be generated. Use the compiler flag --rdc=true when compiling.