cuda1.cu
#include <iostream>
using namespace std ;
# define DELLEXPORT extern "C" __declspec(dllexport)
__global__ void kernel(long* answer = 0){
*answer = threadIdx.x (blockIdx.x * blockDim.x);
}
DELLEXPORT void resoult(long* h_answer){
long* d_answer = 0;
cudaMalloc(&d_answer, sizeof(long));
kernel<<<10,1000>>>(d_answer);
cudaMemcpy(&h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);
cudaFree(d_answer);
}
main.py
import ctypes
import numpy as np
add_lib = ctypes.CDLL(".\\a.dll")
resoult= add_lib.resoult
resoult.argtypes = [ctypes.POINTER(ctypes.c_long)]
x = ctypes.c_long()
print("R:",resoult(x))
print("RV: ",x.value)
print("RB: ",resoult(ctypes.byref(x)))
output in python:0
output in cuda: 2096
I implemented based on c language without any problems but in cuda mode I have a problem how can I have the correct output value
Thanks
CodePudding user response:
cudaMemcpy
is expecting pointers for dst
and src
.
In your function resoult
, h_answer
is a pointer to a long
allocated by the caller.
Since it's already the pointer where the data should be copied to, you should use it as is and not take it's address by using &h_answer
.
Therefore you need to change your cudaMemcpy
from:
cudaMemcpy(&h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);
To:
cudaMemcpy(h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);