Cuda misaglined address for a reused shared block memory-CodePudding

My kernel allocated a shared memory for data storage, but bug reports if I change the size of the shared memory, see codes attached.

#include <stdio.h>
#include <assert.h>

#define cucheck_dev(call)                                   \
{                                                           \
  cudaError_t cucheck_err = (call);                         \
  if(cucheck_err != cudaSuccess) {                          \
    const char *err_str = cudaGetErrorString(cucheck_err);  \
    printf("%s (%d): %s\n", __FILE__, __LINE__, err_str);   \
    assert(0);                                              \
  }                                                         \
}

__global__ void kernel(int datanum)
{
    extern __shared__ int sh[];

    // assign data for data 1
    float2* data_ptr1((float2*)sh);
    for (int thid = threadIdx.x; thid < datanum; thid  = blockDim.x)
    {
      data_ptr1[thid] = make_float2(0., 0.);
    }
    __syncthreads();

    // assign data for data 2
    
    size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));

    if(threadIdx.x == 0) printf("Offset: %d\n", (int)(shOffset));
    __syncthreads();

    float4 *data_ptr2((float4*)&sh[shOffset]);
    for (int thid = threadIdx.x; thid < datanum; thid  = blockDim.x)
    {
        data_ptr2[thid] = make_float4(0., 0., 0., 0.);
    }
    __syncthreads();
}

int main()
{
    int datanum = 21;     // bug reports for datanum = 21, but everthing works fine for datanum = 20
    int blocknum = 1;
    int threadperblock = 128;
    int preallocated = 768;

    size_t shmem = datanum*sizeof(float2)   preallocated*sizeof(int);

    printf("Allocated Shared memory byte: %d  Nums: %d\n", (int)shmem, (int)(shmem/sizeof(int)));

    kernel<<<blocknum, threadperblock, shmem>>>(datanum);
    cudaDeviceSynchronize();

    cucheck_dev(cudaGetLastError());
}

OS: Ubuntu 18.02 Cuda: 10.1 Device: RTX 2060 g : 7.5.0

As shown, the shared memory included two regions, one for fixed data, type as float2.

The other region may save different types as int or float4, offset from the shared memory entry.

When I set the datanum to 20, codes work fine.

But when datanum is changed to 21, code reports a misaligned address.

I greatly appreciate any reply or suggestions.

Thank you!

Some information provided by cuda-memcheck is posted here for a reference:

========= Invalid __shared__ write of size 16
=========     at 0x00000280 in kernel(int)
=========     by thread (20,0,0) in block (0,0,0)
=========     Address 0x000001e8 is misaligned
=========     Device Frame:kernel(int) (kernel(int) : 0x280)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel   0x34e) [0x2efabe]
=========     Host Frame:test [0x13de9]
=========     Host Frame:test [0x13e77]
=========     Host Frame:test [0x4a1c5]
=========     Host Frame:test [0x6f32]
=========     Host Frame:test [0x6df5]
=========     Host Frame:test [0x6e2e]
=========     Host Frame:test [0x6c14]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main   0xe7) [0x21b97]
=========     Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
=========     at 0x00000280 in kernel(int)
=========     by thread (19,0,0) in block (0,0,0)
=========     Address 0x000001d8 is misaligned
=========     Device Frame:kernel(int) (kernel(int) : 0x280)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel   0x34e) [0x2efabe]
=========     Host Frame:test [0x13de9]
=========     Host Frame:test [0x13e77]
=========     Host Frame:test [0x4a1c5]
=========     Host Frame:test [0x6f32]
=========     Host Frame:test [0x6df5]
=========     Host Frame:test [0x6e2e]
=========     Host Frame:test [0x6c14]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main   0xe7) [0x21b97]
=========     Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
=========     at 0x00000280 in kernel(int)
=========     by thread (18,0,0) in block (0,0,0)
=========     Address 0x000001c8 is misaligned
=========     Device Frame:kernel(int) (kernel(int) : 0x280)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel   0x34e) [0x2efabe]
=========     Host Frame:test [0x13de9]
=========     Host Frame:test [0x13e77]
=========     Host Frame:test [0x4a1c5]
=========     Host Frame:test [0x6f32]
=========     Host Frame:test [0x6df5]
=========     Host Frame:test [0x6e2e]
=========     Host Frame:test [0x6c14]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main   0xe7) [0x21b97]
=========     Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
=========     at 0x00000280 in kernel(int)
=========     by thread (17,0,0) in block (0,0,0)
=========     Address 0x000001b8 is misaligned
=========     Device Frame:kernel(int) (kernel(int) : 0x280)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel   0x34e) [0x2efabe]
=========     Host Frame:test [0x13de9]
=========     Host Frame:test [0x13e77]
=========     Host Frame:test [0x4a1c5]
=========     Host Frame:test [0x6f32]
=========     Host Frame:test [0x6df5]
=========     Host Frame:test [0x6e2e]
=========     Host Frame:test [0x6c14]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main   0xe7) [0x21b97]
=========     Host Frame:test [0x69fa]

CodePudding user response：

Your problem is that the alignment for float4 is higher than that for float2. Therefore the lines

size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
float4 *data_ptr2((float4*)&sh[shOffset]);

do not guarantee appropriate alignment for data_ptr2 unless datanum is an even number.

I wrote some code for this issue here: CUDA : Shared memory alignement in documentation

The easiest fix is to just swap data_ptr1 and data_ptr2. Use the front of the memory for the type with the larger alignment