I have a kernel that uses cuda::std::complex<float>, and in this kernel I want to do warp reduction, following this post. The warpReduce function:
template <typename T, unsigned int blockSize>
__device__ void warpReduce(volatile T *sdata, unsigned int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
I'm getting the error: error: no operator "+=" matches these operands, operand types are: volatile cuda::std::complex<float> += volatile cuda::std::complex<float>.
Simply removing the volatile, as suggested in this post, doesn't work.
Is there any way I can still use a complex type (thrust/cuda::std) in warp reduction?
The kernel:
template <unsigned int blockSize>
__global__ void reduce6(cuda::std::complex<float> *g_idata, cuda::std::complex<float> *g_odata, unsigned int n) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize]; i += gridSize; }
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce<cuda::std::complex<float>, blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
I found a workaround by doing a reinterpret_cast to float2/double2 first, but I don't know whether this has any other implications; I've read about undefined behaviour. Any other suggestions? (A compile-time sanity check for this cast is sketched after the code below.)
This works:
template <typename T>
struct myComplex;
template <>
struct myComplex<float>
{
typedef float2 type;
};
template <>
struct myComplex<double>
{
typedef double2 type;
};
template <typename T>
__device__ void warpReduce(volatile T *SharedData, int tid)
{
SharedData[tid].x += SharedData[tid + 32].x;
SharedData[tid].x += SharedData[tid + 16].x;
SharedData[tid].x += SharedData[tid + 8].x;
SharedData[tid].x += SharedData[tid + 4].x;
SharedData[tid].x += SharedData[tid + 2].x;
SharedData[tid].x += SharedData[tid + 1].x;
SharedData[tid].y += SharedData[tid + 32].y;
SharedData[tid].y += SharedData[tid + 16].y;
SharedData[tid].y += SharedData[tid + 8].y;
SharedData[tid].y += SharedData[tid + 4].y;
SharedData[tid].y += SharedData[tid + 2].y;
SharedData[tid].y += SharedData[tid + 1].y;
}
// and then in the kernel:
warpReduce(reinterpret_cast<typename myComplex<T>::type *>(data), tid);
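For what it's worth, the layout assumption behind that cast can at least be checked at compile time. This check is an addition here, not part of the original workaround; it assumes a .cu file compiled with nvcc, where float2 is already available:

#include <cuda/std/complex>

// The cast from cuda::std::complex<float>* to float2* is only plausible
// if the two types agree in size and alignment; these asserts catch a
// layout mismatch at compile time (they do not address strict aliasing).
static_assert(sizeof(cuda::std::complex<float>) == sizeof(float2),
              "cuda::std::complex<float> and float2 differ in size");
static_assert(alignof(cuda::std::complex<float>) == alignof(float2),
              "cuda::std::complex<float> and float2 differ in alignment");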
Answer:
According to my testing, in CUDA 11.7, the issue revolves around the use of volatile.
According to this blog, this style of programming (implicit warp-synchronous) is deprecated.
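As an aside (this sketch is an addition here, not part of the original answer): the replacement that blog recommends for warp-synchronous code is explicit synchronization, either __syncwarp() as used below or warp shuffles. A shuffle-based final-warp reduction for cuda::std::complex<float> might look like the following; warpReduceShfl is a hypothetical name, and it assumes all 32 lanes of the warp are active:

#include <cuda/std/complex>

// Hypothetical sketch: reduce one value per lane across a full warp.
// __shfl_down_sync does not accept cuda::std::complex directly, so the
// real and imaginary parts are shuffled separately.
__device__ cuda::std::complex<float> warpReduceShfl(cuda::std::complex<float> v) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        float re = __shfl_down_sync(0xFFFFFFFFu, v.real(), offset);
        float im = __shfl_down_sync(0xFFFFFFFFu, v.imag(), offset);
        v += cuda::std::complex<float>(re, im);
    }
    return v; // after the loop, lane 0 holds the warp-wide sum
}

The answer below instead keeps the shared-memory style of the original code and fixes it with __syncwarp().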
Additionally, this part of your posted code could not possibly be correct:
extern __shared__ int sdata[];
Combining these ideas, we can do the following:
$ cat t7.cu
#include <cuda/std/complex>
#include <iostream>
// assumes blockSize is a power of 2, 64 or larger, up to a max of 512 (or 1024, see below)
template <typename T, unsigned int blockSize>
__device__ void warpReduce(T *sdata, unsigned int tid) {
T v = sdata[tid]; __syncwarp();
v += sdata[tid + 32]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 16]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 8]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 4]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 2]; __syncwarp();
sdata[tid] = v; __syncwarp();
v += sdata[tid + 1]; __syncwarp();
sdata[tid] = v;
}
template <typename T, unsigned int blockSize>
__global__ void reduce6(T *g_idata, T *g_odata, size_t n) {
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x;
size_t i = blockIdx.x*(blockSize*2) + tid;
size_t gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize]; i += gridSize; }
__syncthreads();
// if (blockSize == 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); } // uncomment to support a blocksize of 1024
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce<T, blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
using my_t = cuda::std::complex<float>;
int main(){
size_t n = 2048;
const unsigned blk = 128;
unsigned grid = n/blk/2;
my_t *i, *h_i;
my_t *o, *h_o;
h_i = new my_t[n];
h_o = new my_t[grid];
for (size_t i = 0; i < n; i++) h_i[i] = {1,1};
cudaMalloc(&i, n*sizeof(my_t));
cudaMalloc(&o, grid*sizeof(my_t));
cudaMemcpy(i, h_i, n*sizeof(my_t), cudaMemcpyHostToDevice);
reduce6<my_t, blk><<<grid,blk, blk*sizeof(my_t)>>>(i, o, n);
cudaMemcpy(h_o, o, grid*sizeof(my_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < grid; i++)
std::cout << cuda::std::real(h_o[i]) << "," << cuda::std::imag(h_o[i]) << std::endl;
cudaDeviceSynchronize();
}
$ nvcc -o t7 t7.cu
$ compute-sanitizer ./t7
========= COMPUTE-SANITIZER
256,256
256,256
256,256
256,256
256,256
256,256
256,256
256,256
========= ERROR SUMMARY: 0 errors
$