Python cannot receive the result from cuda ctypes-CodePudding

cuda script

#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include<iostream>
using namespace std;

# define DELLEXPORT extern "C" __declspec(dllexport)


__global__ void MatAdd (const double *cellbox, const double *verticess, double *C,
                    const double *halfcells,int rows, int cols)
{


    int cellboxidy = threadIdx.y;
    int verticessidx = threadIdx.x;

    const double mincell [3] = {cellbox[cellboxidy * 3] - 
halfcells[0],cellbox[cellboxidy * 3   1] - halfcells[1],cellbox[cellboxidy * 3   2] - 
halfcells[2]};
    const double maxcell [3] = {cellbox[cellboxidy * 3]   
halfcells[0],cellbox[cellboxidy * 3   1]   halfcells[1],cellbox[cellboxidy * 3   2]   
halfcells[2]};

    if (verticess[verticessidx * 3] >= mincell[0] && verticess[verticessidx * 3] <= 
maxcell[0]  && verticess[verticessidx * 3   1] >= mincell[1]  &&  verticess[verticessidx 
* 3   1] <= maxcell[1] && verticess[verticessidx * 3   2] >= mincell[2] && 
verticess[verticessidx * 3   2] <= maxcell[2]){
    
        C[cellboxidy * 3] = 1;
        C[cellboxidy * 3   1] = 1;
        C[cellboxidy * 3   2] = 1;

    }

}

DELLEXPORT void twodarray (double *out , double *celb ,double *ve , double *hfc , 
double *sizecellbox , double *sizeverticess)
{    


    double cellbox [24] = {0};


    for(int i = 0 ; i<int(*sizecellbox) ; i  ){
        cellbox[(i*3)] = celb[(i*3)];
        cellbox[(i*3)   1] = celb[(i*3)   1];
        cellbox[(i*3)   2] = celb[(i*3)   2];


    }



    double verticess [24] = {0};
    //cout<<int(*sizeverticess)<<endl;
    for(int i = 0 ; i<int(*sizeverticess) ; i  ){
        verticess[(i*3)] = ve[(i*3)];
        verticess[(i*3)   1] = ve[(i*3)   1];
        verticess[(i*3)   2] = ve[(i*3)   2];

    }



    const int N = *sizecellbox;
    int M = *sizeverticess;


    double halfcells[3] = {};

    halfcells[0] = hfc[0];
    halfcells[1] = hfc[1];
    halfcells[2] = hfc[2];


    double *C;
    double *cellbox_d = 0, *verticess_d = 0 , *halfcells_d = 0 , *C_d = 0;
    int rows = N;
    int cols = M;

    dim3 blockDim(*sizecellbox, *sizeverticess);


    C = (double *)malloc (sizeof(*C)*100);
    cudaMalloc ((void**) &C_d, sizeof(*C_d)*100);
    cudaMalloc ((void**) &cellbox_d, sizeof(*cellbox_d)*100);
    cudaMalloc ((void**) &verticess_d, sizeof(*verticess_d)*100);
    cudaMalloc ((void**) &halfcells_d, sizeof(*halfcells_d)*3);

    cudaMemcpy (cellbox_d, cellbox, sizeof(*cellbox_d)*100, cudaMemcpyHostToDevice);
    cudaMemcpy (verticess_d, verticess, sizeof(*verticess_d)*100, 
cudaMemcpyHostToDevice);
    cudaMemcpy (halfcells_d, &halfcells, sizeof(*halfcells_d)*3, 
cudaMemcpyHostToDevice);

    MatAdd<<<1,blockDim>>>(cellbox_d, verticess_d, C_d, halfcells_d  , rows, cols);

    cudaMemcpy (C, C_d, sizeof(*C)*100, cudaMemcpyDeviceToHost);

    for (int i=0 ; i<24 ; i  ){
        cout<<C[i]<<endl;
    }
    out = C;

    cudaFree (cellbox);
    cudaFree (verticess_d);
    cudaFree (C_d);
    cudaFree(halfcells_d);
    free (C);
 }
 //nvcc -Xcompiler -fPIC -shared -o 2darray3.dll 2darray3.cu

python script

import numpy as np
import ctypes
from ctypes import * 

def get_cuda_square():
    dll = ctypes.windll.LoadLibrary("C:\\Users\\Ali\\Desktop\\test9\\cuda_remember\\with 
link python\\2darray3.dll") 
    func = dll.twodarray
    func.argtypes = [POINTER(c_double), POINTER(c_double), POINTER(c_double), 
POINTER(c_double),POINTER(c_double) , POINTER(c_double)] 
    return func

__cuda_square = get_cuda_square()

size = int(24)

cellbox_array = [[-0.35263609886169434, -0.35263609886169434, -0.35263609886169434] , 
[0.35263609886169434, -0.35263609886169434, -0.35263609886169434] , 
[-0.35263609886169434, 0.35263609886169434, -0.35263609886169434] , 
[0.35263609886169434, 0.35263609886169434, -0.35263609886169434],[-0.35263609886169434, 
-0.35263609886169434, 0.35263609886169434] , [0.35263609886169434, -0.35263609886169434, 
0.35263609886169434],[-0.35263609886169434, 0.35263609886169434, 0.35263609886169434] , 
[0.35263609886169434, 0.35263609886169434, 0.35263609886169434]]
cellboxsize = c_double(len(cellbox_array))

verticess_array = [0.1, 0.2, 0.3] , [0.4, 0.5, 0.6] ,[0.7, 0.8, 0.9] , [0.10, 0.11, 
0.12],[0.13, 0.14, 0.15] , [0.16, 0.17, 0.18],[0.19, 0.20, 0.21] , [0.22, 0.23, 0.24]
verticesssize = c_double(len(verticess_array))

hfc_array = [1.7546,1.4456,1.4545544]
cellbox = np.array(cellbox_array).astype('double')
verticess = np.array(verticess_array).astype('double')
hfc = np.array(hfc_array).astype('double')
sc = np.array(cellboxsize).astype('double')
sv = np.array(cellboxsize).astype('double')

output = np.zeros(size).astype('double')

a_p = output.ctypes.data_as(POINTER(c_double))
c_p = cellbox.ctypes.data_as(POINTER(c_double))
v_p = verticess.ctypes.data_as(POINTER(c_double))
h_p = hfc.ctypes.data_as(POINTER(c_double))
sc_p = sc.ctypes.data_as(POINTER(c_double))
sv_p = sv.ctypes.data_as(POINTER(c_double))


__cuda_square(a_p, c_p, v_p , h_p,sc_p , sv_p)
print(output)

I can't get the value in python but when I run the tests with the cuda script it returns the correct result.

for example:

correct resoult (cuda): [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Wrong result (python): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

I did some tests on other scripts and found that the problem is probably with determining the appropriate data type for the values, but how can I solve this problem?

thank you:)

Note: The 1's in the output are for testing, and after the script is correct, I want to replace the decimal numbers that were the goal of the algorithm with 1's.

CodePudding user response：

In C (which is what is applicable for understanding here because of the extern "C" decorator), this is not a valid method to copy data from one place to another:

out = C;

That is overwriting a pointer value (which is passed by value, anyway) with another pointer value. The modified pointer value is not seen in the calling environment due to pass-by-value mechanics, therefore the modified/proper/expected data won't be "seen" in the calling environment either.

You are already providing an allocation for out in the calling environment (i.e. in your python code) so the only thing necessary, according to my testing, is to replace that line with an actual data copy, such as:

memcpy(out, C, sizeof(C[0])*24);

According to my testing, that causes the all-ones output to show up in the python environment.

FWIW this issue has nothing to do with CUDA. You would have a similar problem if you attempted that sort of data "return" even if not using CUDA at all.