I just started with CUDA and wanted to write a simple C program using Visual Studio that can find total number of prime numbers from 1-N. I did this:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cmath>
static void HandleError(cudaError_t err,
const char* file,
int line) {
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err),
file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void countPrimeGPU(const int* array, int* count)
{
int number = array[threadIdx.x blockIdx.x * blockDim.x];
if (number <= 1)
return;
for (int i = 2; i <= floor(pow(number, 0.5)); i )
if (!(number % i))
return;
atomicAdd(count, 1);
}
int main()
{
int* host_array;
int* dev_array;
const int size = 9; // prime numbers from 1-9
// allocating & initializing host array
host_array = new int[size];
for (int i = 0; i < size; i )
host_array[i] = i 1; // 1, 2, ..., size
int host_primeCount = 0;
int* dev_pprimeCount;
HANDLE_ERROR(cudaSetDevice(0));
HANDLE_ERROR(cudaMalloc((void**)&dev_array, size * sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_array, host_array, size * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc((void**)&dev_pprimeCount, sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_pprimeCount, &host_primeCount, sizeof(int), cudaMemcpyHostToDevice));
countPrimeGPU <<< size, 1 >>> (dev_array, dev_pprimeCount); // !!!
HANDLE_ERROR(cudaGetLastError());
HANDLE_ERROR(cudaDeviceSynchronize());
HANDLE_ERROR(cudaMemcpy(&host_primeCount, dev_pprimeCount, sizeof(int), cudaMemcpyDeviceToHost));
printf("Prime count for the first %d numbers: %d\n", size, host_primeCount);
cudaFree(dev_array);
cudaFree(dev_pprimeCount);
HANDLE_ERROR(cudaDeviceReset());
delete[] host_array;
return 0;
}
The problem here is that I get correct result only when size
is in interval [1-8]
. But, when setting it to 9
or above it is always incorrect. What am I doing wrong? I suspect I have incorrectly set the configuration (number of blocks/threads) when calling countPrimeGPU
, but was unable to fix it. Ultimately, I would like to test it with size=10'000'000
, and compare it with my multi-threaded CPU implementation. Thank you.
CodePudding user response:
The proximal issue is that when number
is 9, floor(pow(number, 0.5))
is giving you 2, not 3 as you expect. As a result, 9 is incorrectly flagged as a prime.
here is a similar question. pow()
(at least, in CUDA device code) does not have the absolute accuracy you desire/need when using floor()
(i.e. truncation). You might have a workable path using ordinary rounding rather than truncation (since testing against a factor slightly larger than the square root is not going to disrupt the correctness of your approach), but the method I would suggest to address this is to change your for-loop as follows:
for (int i = 2; (i*i) <= number; i )
This avoids the floating-point head scratching, and should be computationally simpler as well. For your desired range of number
(up to 10,000,000) the quantity i*i
will fit in a int
type/value, so I don't forsee issues there.
Since this is just a learning exercise, I'm not trying to scrub your code for all sorts of optimization improvements. To pick one example, launching blocks of 1 thread each:
countPrimeGPU <<< size, 1 >>> (dev_array, dev_pprimeCount); // !!!
is not particularly efficient on the GPU, but it is not algorithmically incorrect to do so.