CUDA cudaMemcpy, an illegal memory access was encountered-CodePudding

i'm fairly new to cuda and i want to use the concept of constant memory, but i'm getting an illegal memory access was encountered when running the code.

My kernel looks like this

__global__ void nonceKernel(int inLen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int nonceLen, int* finishedFlag, int *mutex, int size)
{
        if(!*finishedFlag) return;

        unsigned int tid = blockIdx.x * blockDim.x   threadIdx.x;

        bool found = true;
        BYTE tempNonce[2];
        BYTE tempSha1[20];

        tempNonce[1]=((tid size) >> 8) & 0x000000FF;
        tempNonce[0]=(tid size) & 0x000000FF;

        CUDA_SHA1 ctx;

        cuda_sha1_init(&ctx); //init context
        cuda_sha1_update(&ctx, device_input_data, inLen); // add input buffer
        cuda_sha1_update(&ctx, tempNonce, nonceLen); //add nonce
        cuda_sha1_final(&ctx, tempSha1); //compute sha1

        for(int i=0; i<shaTermLength; i  ) {
                if(tempSha1[19 - i] != device_sha1_term[shaTermLength - 1 - i])
                        found=false;
        }

        if(found == true) {
                lock(mutex);
                memcpy(outSha1, tempSha1, 20); //20 bytes for sha1
                memcpy(outNonce, tempNonce, nonceLen); //2 bytes for nonce
                *finishedFlag = 0;
                unlock(mutex);
        }
}

My intermediary function like this:

cudaError_t nonceWithCuda(int intlen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int *finishFlag, int nonceLen, int size)
{
        BYTE *gpuSha1Out;
        BYTE *gpuNonceOut;
        int *gpuFinishedFlag;
        cudaError_t cudaStatus;
        int *mutex;

        cudaStatus= cudaSetDevice(0);
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaSetDevice failed! Do you have a cuda gpu installed?");
                goto Error;
        }
        ....
        cudaStatus=cudaMalloc((void**)&gpuFinishedFlag, 1*sizeof(int));
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaMalloc for gpuFinishedFlag failed");
                goto Error;

        }

        cudaStatus=cudaMemcpy(gpuFinishedFlag, finishFlag, sizeof(int), cudaMemcpyHostToDevice);
        if(cudaStatus!=cudaSuccess) {
                fprintf(stderr, "cudamemcpy 0 to gpuFinishedFlag failed!");
                goto Error;
        }
        ....
        while(*finishFlag) {
                nonceKernel<<<128, 1024>>>(intlen, shaTermLength, gpuSha1Out, gpuNonceOut, nonceLen, gpuFinishedFlag, mutex, size);
                size  ;
                cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
                if(cudaStatus!=cudaSuccess) {
                        fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
                        goto Error;
                }
        }
        ......

Error:
        cudaFree(gpuSha1Out);
        cudaFree(gpuNonceOut);
        cudaFree(gpuFinishedFlag);

        return cudaStatus;
}

Also i'm declaring the constant variables as such:

__constant__ BYTE* device_input_data;
__constant__ BYTE* device_sha1_term;

where BYTE is defined as an unsigned char typedef unsigned char BYTE;.

And finally the main function.

int main(int argc, char** argv) {

        size_t input_block_size=5; //bytes
        int nonceLen=2;
        int finishedFlag=1;

        ...

        BYTE* inputData = (BYTE*) malloc(input_block_size * sizeof(BYTE)); //input byte buffer
        inputData[0]=0x23; //#
        inputData[1]=0x30; //0
        inputData[2]=0x42; //B
        inputData[3]=0x69; //i
        inputData[4]=0x61; //a
        BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE));
        shaTerm[0]=0x7E;
        shaTerm[1]=0x46;
        int shaTermLength = sizeof(shaTerm)/sizeof(shaTerm[0]);//ouput sha buffer
        cudaStatus=cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        cudaStatus=cudaMemcpyToSymbol(device_sha1_term, shaTerm, shaTermLength * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        ...
        nonceWithCuda(input_block_size, shaTermLength, outputSha1Buffer, outputNonceBuffer, &finishedFlag, 2, size);

The error occurs in the while from the nonceWithCuda function, when i'm copying back the value from the gpu to host, i mean this piece of code:

cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
       fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
       goto Error;
}

The output:

$ ./nonce_v3
MemcpyToSymbol: no error
MemcpyToSymbol: no error
cudaMemcpy from gpuFinishedFlag failed, with code: an illegal memory access was encountered!

Note that the same code works fine when i'm not using constant for those two variables and cannot understand why. Can someone point me in the right direction?

Thank you for your help!!!

CodePudding user response：

I am assuming that you want to store the 5 elements of inputData in constant memory.

The line __constant__ BYTE* device_input_data; will reserve constant memory to store a single pointer. It will not reserve constant memory for 5 BYTE values.

Then, with

cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);

the memory adress to which this pointer points to is set to the elements of inputData, i.e. after transfer, the pointer could have the value 0x2330426961000000 . Most likely, this is not a valid address to device memory. This causes the observed memory error when trying to access this memory location in your kernel.

To fix this, you need to declare the constant memory as a BYTE array of size 5.

__constant__ BYTE device_input_data[5];