I am learning CUDA by myself. My ultimate goal is to apply it to Fortran, but because a lot of classes/videos are based on C/C++, I often end up performing the same exercise in both (which is a good thing). Currently, I am trying to run a basic exercise which performs a(i) = b(i) + c(i) on the GPU. For completeness, I am posting both codes for comparison:
- C code below
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_common.cuh"
#include "common.h"
//assume grid is 1D and block is 1D then nx = size
__global__ void sum_arrays_1Dgrid_1Dblock(float* a, float* b, float *c, int nx)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
if (gid < nx)
c[gid] = a[gid] + b[gid];
}
void run_sum_array_1d(int argc, char** argv)
{
printf("Runing 1D grid \n");
int size = 1 << 22;
int block_size = 128;
int nx, ny = 0;
if (argc > 2)
size = 1 << atoi(argv[2]);
if (argc > 4)
block_size = 1 << atoi(argv[4]);
unsigned int byte_size = size * sizeof(float);
printf("Input size : %d \n",size);
float * h_a, *h_b, *h_out, *h_ref;
h_a = (float*)malloc(byte_size);
h_b = (float*)malloc(byte_size);
h_out = (float*)malloc(byte_size);
h_ref = (float*)malloc(byte_size);
if (!h_a)
printf("host memory allocation error \n");
for (size_t i = 0; i < size; i++)
{
h_a[i] = i % 10;
h_b[i] = i % 7;
}
clock_t cpu_start, cpu_end;
cpu_start = clock();
sum_array_cpu(h_a, h_b, h_out,size);
cpu_end = clock();
dim3 block( block_size);
dim3 grid((size + block.x - 1)/block.x);
printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
grid.x,grid.y,grid.z,block.x,block.y, block.z);
float *d_a, *d_b, *d_c;
gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
gpuErrchk(cudaMemset(d_c,0,byte_size));
clock_t htod_start, htod_end;
htod_start = clock();
gpuErrchk(cudaMemcpy(d_a,h_a,byte_size,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));
htod_end = clock();
clock_t gpu_start, gpu_end;
gpu_start = clock();
sum_arrays_1Dgrid_1Dblock<<<grid, block>>>(d_a, d_b, d_c, size);
gpuErrchk(cudaDeviceSynchronize());
gpu_end = clock();
clock_t dtoh_start, dtoh_end;
dtoh_start = clock();
gpuErrchk(cudaMemcpy(h_ref,d_c,byte_size,cudaMemcpyDeviceToHost));
dtoh_end = clock();
compare_arrays(h_out, h_ref,size);
// elapsed time comparison
printf("Sum array CPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(cpu_end - cpu_start)/CLOCKS_PER_SEC));
printf("Sum array GPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(gpu_end - gpu_start)/CLOCKS_PER_SEC));
printf("htod mem transfer time [ms] : %4.6f \n",
(double)((double)1000.0*(htod_end - htod_start)/CLOCKS_PER_SEC));
printf("dtoh mem transfer time [ms] : %4.6f \n",
(double)((double)1000.0*(dtoh_end - dtoh_start)/CLOCKS_PER_SEC));
printf("Total GPU execution time [ms] : %4.6f \n",
(double)((double)1000.0*(dtoh_end - htod_start)/CLOCKS_PER_SEC));
cudaFree(d_c);
cudaFree(d_b);
cudaFree(d_a);
free(h_ref);
free(h_out);
free(h_b);
free(h_a);
}
////arguments :
////1 - kernel (0:1D or 1:2D),
////2 - input size (2 pow (x))
////3 - for 2D kernel nx,
////4 - block.x
////5 - block.y
int main(int argc, char** argv)
{
printf("\n----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ \n\n");
printf("argc : %d \n",argc);
for (int i = 0; i < argc; i++)
{
printf("argv : %s \n",argv[i]);
};
run_sum_array_1d(argc, argv);
//query_device();
return 0;
}
- Fortran code below
#include 'Error.fpp'
MODULE CUDAOps
USE cudafor
USE CUDAUtils
USE CPUOps
IMPLICIT NONE
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE sumArraysGPU_1D(a,b,c,Nsize)
IMPLICIT NONE
!> intent variables
INTEGER, INTENT(IN), DIMENSION(:) :: a,b
INTEGER, INTENT(INOUT), DIMENSION(:) :: c
INTEGER, INTENT(IN), VALUE :: Nsize
!> local variables
INTEGER :: blockId, threadId
! get the blockId
blockId = (blockIdx%x-1)
! get the threadId
threadId = blockId * blockDim%x + threadIdx%x - 1
! adjust so that threadId starts from 1
threadId = threadId + 1
!WRITE(*,*) 'threadId = ',threadId
! set the maximum
IF (threadId <= Nsize) THEN
! perform the sum
c(threadId) = a(threadId) + b(threadId)
END IF
END SUBROUTINE sumArraysGPU_1D
SUBROUTINE runSumArrays1D(xpow,blockSizeX)
IMPLICIT NONE
! intent variables
INTEGER, INTENT(IN) :: xpow,blockSizeX
!> variables declaration
! size of the arrays
INTEGER:: Nsize
! size of the GPU block
INTEGER:: block_size
! other auxiliary variables
INTEGER :: i,j,istat
REAL(KIND=wp) :: t1,t2,time,timeGPU
TYPE(cudaEvent) :: startEvent, stopEvent
! host data allocation
INTEGER, DIMENSION(:), ALLOCATABLE :: h_a, h_b, h_c, gpu_results
! device data allocation
INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a, d_b, d_c
! define the GPU grid and block
TYPE(DIM3) :: grid, tBlock
! define data size and block size along X dimension
Nsize = 2**xpow
block_size = 2**blockSizeX
! allocate memory in host
ALLOCATE(h_a(Nsize))
ALLOCATE(h_b(Nsize))
ALLOCATE(h_c(Nsize))
! allocate memory in device
ALLOCATE(gpu_results(Nsize))
ALLOCATE(d_a(Nsize))
ALLOCATE(d_b(Nsize))
ALLOCATE(d_c(Nsize))
! define block and grid
tBlock = DIM3(block_size,1,1)
grid = DIM3((Nsize/tBlock%x),1,1)
! host data initialization
CALL generateNumberByIntegerDivision(h_a,10,Nsize)
CALL generateNumberByIntegerDivision(h_b,7,Nsize)
WRITE(*,*) 'Kernel is going to be launched with'
WRITE(*,*) 'Nsize = ',Nsize
WRITE(*,*) 'xpow = ',xpow
WRITE(*,*) 'blockSizeX = ',blockSizeX
WRITE(*,*) 'block_size = ',block_size
WRITE(*,*) 'grid.x = ',grid%x
WRITE(*,*) 'grid.y = ',grid%y
WRITE(*,*) 'grid.z = ',grid%z
WRITE(*,*) 'block.x = ',tblock%x
WRITE(*,*) 'block.y = ',tblock%y
WRITE(*,*) 'block.z = ',tblock%z
timeGPU = 0.0_wp
CALL CPU_TIME(t1)
! perform the sum in serial using the CPU
CALL sumArraysCPU(h_a,h_b,h_c)
CALL CPU_TIME(t2)
WRITE(*,*) 'time for the CPU implementation (ms) = ',(t2-t1)*1e3
! initialize CUDA events
!istat = cudaEventCreate(startEvent)
GPU_ERROR(cudaEventCreate(startEvent))
istat = cudaEventCreate(stopEvent)
! copy the source data h_a from CPU to GPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(d_a,h_a,Nsize,cudaMemcpyHostToDevice)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to transfer h_a to GPU (ms) = ',time
timeGPU = timeGPU + time
! copy the source data h_b from CPU to GPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(d_b,h_b,Nsize,cudaMemcpyHostToDevice)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to transfer h_b to GPU (ms) = ',time
timeGPU = timeGPU + time
! perform the sum on the GPU
istat = cudaEventRecord(startEvent,0)
CALL sumArraysGPU_1D<<<grid, tBlock>>>(d_a,d_b,d_c,Nsize)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to perform the sum on GPU (ms) = ',time
timeGPU = timeGPU + time
! copy the data back from GPU to CPU
istat = cudaEventRecord(startEvent,0)
istat = cudaMemCpy(gpu_results,d_c,Nsize,cudaMemcpyDeviceToHost)
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
WRITE(*,*) 'time to copy back data from GPU to CPU (ms) = ',time
timeGPU = timeGPU + time
WRITE(*,*) 'Total time to execute GPU (ms) :',timeGPU
!WRITE(*,*) 'h_c = ',h_c
!WRITE(*,*) 'gpu_results = ',gpu_results
! make a formal check of the result component by component
CALL checkArraysCPU(h_c,gpu_results,Nsize)
WRITE(*,*) 'SUM(h_c) = ',SUM(h_c)
WRITE(*,*) 'SUM(gpu_results) = ',SUM(gpu_results)
! deallocate memory in host
DEALLOCATE(h_a)
DEALLOCATE(h_b)
DEALLOCATE(h_c)
! deallocate memory in device
DEALLOCATE(gpu_results)
DEALLOCATE(d_a)
DEALLOCATE(d_b)
DEALLOCATE(d_c)
END SUBROUTINE runSumArrays1D
END MODULE CUDAOps
PROGRAM main
USE CPUOps
USE CUDAOps
IMPLICIT NONE
! declare local variables
INTEGER :: i,xpow,sizeBlockX
! set the default values
xpow = 22
sizeBlockX = 7
! launch the dedicated routines
CALL runSumArrays1D(xpow,sizeBlockX)
STOP
END PROGRAM main
When I run the code with the default options (data size and block size) using nvprof, using this command for both codes:
nvprof ./code.x
I get the following output.
- for the C code:
----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------
Runing 1D grid
Input size : 4194304
Kernel is lauch with grid(32768,1,1) and block(128,1,1)
==33351== NVPROF is profiling process 33351, command: ./code_c.x
Arrays are same
Sum array CPU execution time [ms] : 4.850000
Sum array GPU execution time [ms] : 1.610000
htod mem transfer time [ms] : 10.640000
dtoh mem transfer time [ms] : 5.759000
Total GPU execution time [ms] : 18.011000
==33351== Profiling application: ./code_c.x
==33351== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 61.35% 10.715ms 2 5.3577ms 5.3566ms 5.3589ms [CUDA memcpy HtoD]
30.94% 5.4040ms 1 5.4040ms 5.4040ms 5.4040ms [CUDA memcpy DtoH]
5.81% 1.0147ms 1 1.0147ms 1.0147ms 1.0147ms sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
1.90% 331.81us 1 331.81us 331.81us 331.81us [CUDA memset]
API calls: 75.67% 60.242ms 3 20.081ms 55.398us 60.116ms cudaMalloc
20.59% 16.393ms 3 5.4645ms 5.2016ms 5.7578ms cudaMemcpy
2.00% 1.5906ms 1 1.5906ms 1.5906ms 1.5906ms cudaDeviceSynchronize
1.47% 1.1673ms 3 389.10us 186.65us 497.81us cudaFree
0.14% 107.71us 101 1.0660us 88ns 57.578us cuDeviceGetAttribute
0.08% 65.483us 1 65.483us 65.483us 65.483us cuDeviceGetName
0.02% 17.946us 1 17.946us 17.946us 17.946us cudaMemset
0.02% 16.011us 1 16.011us 16.011us 16.011us cudaLaunchKernel
0.01% 8.6300us 1 8.6300us 8.6300us 8.6300us cuDeviceGetPCIBusId
0.00% 1.1600us 3 386ns 146ns 846ns cuDeviceGetCount
0.00% 369ns 2 184ns 94ns 275ns cuDeviceGet
0.00% 246ns 1 246ns 246ns 246ns cuDeviceTotalMem
0.00% 194ns 1 194ns 194ns 194ns cuModuleGetLoadingMode
0.00% 167ns 1 167ns 167ns 167ns cuDeviceGetUuid
- for the Fortran code:
==38266== NVPROF is profiling process 38266, command: ./code_f.x
Kernel is going to be launched with
Nsize = 4194304
xpow = 22
blockSizeX = 7
block_size = 128
grid.x = 32768
grid.y = 1
grid.z = 1
block.x = 128
block.y = 1
block.z = 1
time for the CPU implementation (ms) = 4.997969
time to transfer h_a to GPU (ms) = 5.680192
time to transfer h_b to GPU (ms) = 5.561248
time to perform the sum on GPU (ms) = 1.510400
time to copy back data from GPU to CPU (ms) = 7.039712
Total time to execute GPU (ms) : 19.79155
Arrays are the same!
SUM(h_c) = 1592097881
SUM(gpu_results) = 1592097881
==38266== Profiling application: ./code_f.x
==38266== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 58.75% 10.911ms 5 2.1822ms 1.2160us 5.4682ms [CUDA memcpy HtoD]
35.16% 6.5297ms 1 6.5297ms 6.5297ms 6.5297ms [CUDA memcpy DtoH]
6.10% 1.1321ms 1 1.1321ms 1.1321ms 1.1321ms cudaops_sumarraysgpu_1d_
API calls: 87.80% 150.69ms 6 25.115ms 2.5020us 150.30ms cudaMalloc
9.95% 17.072ms 6 2.8454ms 4.1870us 7.0309ms cudaMemcpy
1.39% 2.3788ms 6 396.47us 2.2640us 1.1368ms cudaFree
0.72% 1.2281ms 4 307.02us 6.6590us 629.72us cudaEventSynchronize
0.05% 93.254us 101 923ns 92ns 41.961us cuDeviceGetAttribute
0.04% 64.982us 1 64.982us 64.982us 64.982us cuDeviceGetName
0.02% 36.395us 8 4.5490us 1.1180us 13.299us cudaEventRecord
0.02% 31.801us 2 15.900us 873ns 30.928us cudaEventCreate
0.01% 18.638us 1 18.638us 18.638us 18.638us cudaLaunchKernel
0.00% 6.3520us 4 1.5880us 970ns 2.5790us cudaEventElapsedTime
0.00% 4.9980us 1 4.9980us 4.9980us 4.9980us cuDeviceGetPCIBusId
0.00% 1.5290us 3 509ns 165ns 1.1890us cuDeviceGetCount
0.00% 444ns 2 222ns 92ns 352ns cuDeviceGet
0.00% 279ns 1 279ns 279ns 279ns cuModuleGetLoadingMode
0.00% 248ns 1 248ns 248ns 248ns cuDeviceTotalMem
0.00% 164ns 1 164ns 164ns 164ns cuDeviceGetUuid
What I am trying to understand here is why the number of "cudaMalloc", "cudaMemcpy" and "cudaFree" calls agrees with what I wrote in the C code, whereas it does not in the Fortran code. Specifically, although I allocate only 3 device arrays, nvprof shows cudaMalloc being called 6 times. Why?
I am trying to understand whether there is a mistake/bug in my Fortran code, or whether this is normal behavior and, if so, why. Thank you.
I have tried to play with the allocation statements of the d_a, d_b and d_c arrays in Fortran. It looks as if the kernel call performs the cudaMalloc and memory copy again, on top of the calls already done explicitly.
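A minimal test case along these lines could be profiled on its own with nvprof to check whether the doubling is per device array rather than something specific to the larger code (this is only a sketch; the module name testKernels, the kernel doubleIt and the variable names are made up for illustration):

! minimal test: one device array, one explicit cudaMemcpy each way, one kernel launch
MODULE testKernels
USE cudafor
IMPLICIT NONE
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE doubleIt(a, n)
IMPLICIT NONE
INTEGER, INTENT(INOUT), DIMENSION(:) :: a
INTEGER, INTENT(IN), VALUE :: n
INTEGER :: i
i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
IF (i <= n) a(i) = 2 * a(i)
END SUBROUTINE doubleIt
END MODULE testKernels
PROGRAM minimalAllocTest
USE cudafor
USE testKernels
IMPLICIT NONE
INTEGER, PARAMETER :: n = 1024
INTEGER, DIMENSION(n) :: h_a
INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a
INTEGER :: istat
h_a = 1
! one explicit device allocation
ALLOCATE(d_a(n))
! one explicit HtoD copy, one kernel launch, one explicit DtoH copy
istat = cudaMemcpy(d_a, h_a, n, cudaMemcpyHostToDevice)
CALL doubleIt<<<(n + 127)/128, 128>>>(d_a, n)
istat = cudaDeviceSynchronize()
istat = cudaMemcpy(h_a, d_a, n, cudaMemcpyDeviceToHost)
DEALLOCATE(d_a)
WRITE(*,*) 'h_a(1) = ', h_a(1)   ! expect 2
END PROGRAM minimalAllocTest

Running nvprof on this and counting the cudaMalloc/cudaMemcpy/cudaFree API calls for the single array would show whether each device array accounts for two of each, independently of the rest of my code.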
----------------- EDIT: an additional question. If I print some nvprof-specific metrics, such as:
nvprof --metrics gld_efficiency,sm_efficiency,achieved_occupancy ./code.x
- this is the C output:
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA GeForce MX330 (0)"
Kernel: sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
1 sm_efficiency Multiprocessor Activity 99.50% 99.50% 99.50%
1 achieved_occupancy Achieved Occupancy 0.922875 0.922875 0.922875
- and this is the Fortran output
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA GeForce MX330 (0)"
Kernel: cudaops_sumarraysgpu_1d_
1 gld_efficiency Global Memory Load Efficiency 67.86% 67.86% 67.86%
1 sm_efficiency Multiprocessor Activity 99.62% 99.62% 99.62%
1 achieved_occupancy Achieved Occupancy 0.877743 0.877743 0.877743
You can clearly see a difference in the Global Memory Load Efficiency. Are the two issues related?
CodePudding user response:
In CUDA Fortran, at least, a Fortran array has metadata that is needed by the (CUDA) Fortran device code generation.
This metadata results in 2 allocations per Fortran array: one for the actual data and one for the metadata.
One example of what that metadata might be is the "width" of the array. (I'm using the term Fortran array loosely here. You won't always witness this metadata for every sort of device allocation.)
Naturally, since the metadata is needed in this case (as determined by the CUDA Fortran compiler) and is allocated separately, you are going to witness 2 copy operations per array as well, and, correspondingly, 2 free operations.
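One way to probe this, purely as a sketch of an experiment (the name sumArraysGPU_1D_explicit and the explicit-shape declarations are mine, and whether this changes the profile at all depends on the compiler), is to write a variant of the kernel whose dummy arrays are declared with an explicit size instead of assumed shape, so that no array descriptor is needed inside the kernel, and compare the nvprof output:

! hypothetical variant of the kernel: explicit-shape dummy arrays instead of
! assumed-shape ones, so the kernel does not need an array descriptor.
! Whether this removes the extra cudaMalloc/cudaMemcpy calls or changes
! gld_efficiency is not guaranteed; it is something to check with nvprof.
ATTRIBUTES(GLOBAL) SUBROUTINE sumArraysGPU_1D_explicit(a, b, c, Nsize)
IMPLICIT NONE
INTEGER, INTENT(IN), VALUE :: Nsize
INTEGER, INTENT(IN), DIMENSION(Nsize) :: a, b
INTEGER, INTENT(INOUT), DIMENSION(Nsize) :: c
INTEGER :: threadId
threadId = (blockIdx%x - 1) * blockDim%x + threadIdx%x
IF (threadId <= Nsize) THEN
c(threadId) = a(threadId) + b(threadId)
END IF
END SUBROUTINE sumArraysGPU_1D_explicit

The host side would launch it exactly as before, CALL sumArraysGPU_1D_explicit<<<grid,tBlock>>>(d_a,d_b,d_c,Nsize). If the cudaMalloc/cudaMemcpy/cudaFree counts and the gld_efficiency number change, the difference is coming from the array metadata rather than from the data itself; if they do not, the metadata handling is tied to the device ALLOCATE rather than to the kernel interface.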