I have a kernel that uses cuda::std::complex<float>, and in this kernel I want to do warp reduction, following this post. The warpReduce function:
template <typename T, unsigned int blockSize>
__device__ void warpReduce(volatile T *sdata, unsigned int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
I'm getting the error: error: no operator "+=" matches these operands, operand types are: volatile cuda::std::complex<float> += volatile cuda::std::complex<float>.
Simply removing the volatile, as suggested in this post, doesn't work.
Is there any way I can still use a complex type (thrust/cuda::std) in warp reduction?
The kernel:
template <unsigned int blockSize>
__global__ void reduce6(cuda::std::complex<float> *g_idata, cuda::std::complex<float> *g_odata, unsigned int n) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize]; i += gridSize; }
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce<cuda::std::complex<float>, blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
I found a workaround by doing a reinterpret_cast to float2/double2 first, but I don't know whether this has any other implications; I've read about undefined behaviour. Any other suggestions? (A compile-time sanity check for this cast is sketched after the code below.)
This works:
template <typename T>
struct myComplex;
template <>
struct myComplex<float>
{
typedef float2 type;
};
template <>
struct myComplex<double>
{
typedef double2 type;
};
template <typename T>
__device__ void warpReduce(volatile T *SharedData, int tid)
{
SharedData[tid].x += SharedData[tid + 32].x;
SharedData[tid].x += SharedData[tid + 16].x;
SharedData[tid].x += SharedData[tid + 8].x;
SharedData[tid].x += SharedData[tid + 4].x;
SharedData[tid].x += SharedData[tid + 2].x;
SharedData[tid].x += SharedData[tid + 1].x;
SharedData[tid].y += SharedData[tid + 32].y;
SharedData[tid].y += SharedData[tid + 16].y;
SharedData[tid].y += SharedData[tid + 8].y;
SharedData[tid].y += SharedData[tid + 4].y;
SharedData[tid].y += SharedData[tid + 2].y;
SharedData[tid].y += SharedData[tid + 1].y;
}
// and then in the kernel:
warpReduce(reinterpret_cast<typename myComplex<T>::type *>(data), tid);
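For what it's worth, the layout assumption behind that cast can at least be checked at compile time. This check is an addition here, not part of the original workaround; it assumes a .cu file compiled with nvcc, where float2 is already available:

#include <cuda/std/complex>

// The cast from cuda::std::complex<float>* to float2* is only plausible
// if the two types agree in size and alignment; these asserts catch a
// layout mismatch at compile time (they do not address strict aliasing).
static_assert(sizeof(cuda::std::complex<float>) == sizeof(float2),
              "cuda::std::complex<float> and float2 differ in size");
static_assert(alignof(cuda::std::complex<float>) == alignof(float2),
              "cuda::std::complex<float> and float2 differ in alignment");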
Answer:
According to my testing, in CUDA 11.7, the issue revolves around the use of volatile.
According to this blog, this style of programming (implicit warp-synchronous) is deprecated.
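As an aside (this sketch is an addition here, not part of the original answer): the replacement that blog recommends for warp-synchronous code is explicit synchronization, either __syncwarp() as used below or warp shuffles. A shuffle-based final-warp reduction for cuda::std::complex<float> might look like the following; warpReduceShfl is a hypothetical name, and it assumes all 32 lanes of the warp are active:

#include <cuda/std/complex>

// Hypothetical sketch: reduce one value per lane across a full warp.
// __shfl_down_sync does not accept cuda::std::complex directly, so the
// real and imaginary parts are shuffled separately.
__device__ cuda::std::complex<float> warpReduceShfl(cuda::std::complex<float> v) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        float re = __shfl_down_sync(0xFFFFFFFFu, v.real(), offset);
        float im = __shfl_down_sync(0xFFFFFFFFu, v.imag(), offset);
        v += cuda::std::complex<float>(re, im);
    }
    return v; // after the loop, lane 0 holds the warp-wide sum
}

The answer below instead keeps the shared-memory style of the original code and fixes it with __syncwarp().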
Additionally, this part of your posted code could not possibly be correct:
extern __shared__ int sdata[];
Combining these ideas, we can do the following:
$ cat t7.cu
#include <cuda/std/complex>
#include <iostream>
// assumes blockSize is a power of 2, 64 or larger, up to a max of 512 (or 1024, see below)
template <typename T, unsigned int blockSize>
__device__ void warpReduce(T *sdata, unsigned int tid) {
T v = sdata[tid]; __syncwarp();
v += sdata[tid + 32]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 16]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 8]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 4]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 2]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 1]; __syncwarp();
sdata[tid] = v;
}
template <typename T, unsigned int blockSize>
__global__ void reduce6(T *g_idata, T *g_odata, size_t n) {
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x;
size_t i = blockIdx.x*(blockSize*2) + tid;
size_t gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize]; i += gridSize; }
__syncthreads();
// if (blockSize == 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); } // uncomment to support a blocksize of 1024
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce<T, blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
using my_t = cuda::std::complex<float>;
int main(){
size_t n = 2048;
const unsigned blk = 128;
unsigned grid = n/blk/2;
my_t *i, *h_i;
my_t *o, *h_o;
h_i = new my_t[n];
h_o = new my_t[grid];
for (size_t i = 0; i < n; i++) h_i[i] = {1,1};
cudaMalloc(&i, n*sizeof(my_t));
cudaMalloc(&o, grid*sizeof(my_t));
cudaMemcpy(i, h_i, n*sizeof(my_t), cudaMemcpyHostToDevice);
reduce6<my_t, blk><<<grid,blk, blk*sizeof(my_t)>>>(i, o, n);
cudaMemcpy(h_o, o, grid*sizeof(my_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < grid; i++)
std::cout << cuda::std::real(h_o[i]) << "," << cuda::std::imag(h_o[i]) << std::endl;
cudaDeviceSynchronize();
}
$ nvcc -o t7 t7.cu
$ compute-sanitizer ./t7
========= COMPUTE-SANITIZER
256,256
256,256
256,256
256,256
256,256
256,256
256,256
256,256
========= ERROR SUMMARY: 0 errors
$