Difference in nvprof output between a C and Fortran CUDA basic example


I am learning CUDA by myself. My ultimate goal is to apply it to Fortran, but because a lot of classes/videos are based on C/C++, I often end up performing the same exercise in both (which is a good thing). Currently, I am trying to run a basic exercise which performs a(i) = b(i) + c(i) on the GPU. For completeness, I am posting both codes for comparison:

  1. C code below
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cuda_common.cuh"
#include "common.h"

//assume grid is 1D and block is 1D then nx = size
__global__ void sum_arrays_1Dgrid_1Dblock(float* a, float* b, float *c, int nx)
{
   int gid = blockIdx.x * blockDim.x + threadIdx.x;

   if (gid < nx)
      c[gid] = a[gid] + b[gid];
}


void run_sum_array_1d(int argc, char** argv)
{
   printf("Runing 1D grid \n");
   int size = 1 << 22;
   int block_size = 128;

   int nx, ny = 0;

   if (argc > 2)
      size = 1 << atoi(argv[2]);

   if (argc > 4)
      block_size = 1 << atoi(argv[4]);


   unsigned int byte_size = size * sizeof(float);

   printf("Input size : %d \n",size);

   float * h_a, *h_b, *h_out, *h_ref;
   h_a = (float*)malloc(byte_size);
   h_b = (float*)malloc(byte_size);
   h_out = (float*)malloc(byte_size);
   h_ref = (float*)malloc(byte_size);


   if (!h_a)
      printf("host memory allocation error \n");

   for (size_t i = 0; i < size; i++)
   {
      h_a[i] = i % 10;
      h_b[i] = i % 7;
   }

   clock_t cpu_start, cpu_end;
   cpu_start = clock();
   sum_array_cpu(h_a, h_b, h_out,size);
   cpu_end   = clock();

   dim3 block( block_size);
   dim3 grid((size + block.x - 1)/block.x);

   printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
      grid.x,grid.y,grid.z,block.x,block.y, block.z);

   float *d_a, *d_b, *d_c;

   gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
   gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
   gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
   gpuErrchk(cudaMemset(d_c,0,byte_size));

   clock_t htod_start, htod_end;
   htod_start = clock();
   gpuErrchk(cudaMemcpy(d_a,h_a,byte_size,cudaMemcpyHostToDevice));
   gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));
   htod_end = clock();

   clock_t gpu_start, gpu_end;
   gpu_start = clock();
   sum_arrays_1Dgrid_1Dblock<<<grid, block>>>(d_a, d_b, d_c, size);
   gpuErrchk(cudaDeviceSynchronize());
   gpu_end   = clock();

   clock_t dtoh_start, dtoh_end;
   dtoh_start = clock();
   gpuErrchk(cudaMemcpy(h_ref,d_c,byte_size,cudaMemcpyDeviceToHost));
   dtoh_end   = clock();

   compare_arrays(h_out, h_ref,size);

     // elapsed time comparison
   printf("Sum array CPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(cpu_end - cpu_start)/CLOCKS_PER_SEC));
   printf("Sum array GPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(gpu_end - gpu_start)/CLOCKS_PER_SEC));
   printf("htod mem transfer time [ms] : %4.6f \n",
         (double)((double)1000.0*(htod_end - htod_start)/CLOCKS_PER_SEC));
   printf("dtoh mem transfer time [ms] : %4.6f \n",
        (double)((double)1000.0*(dtoh_end - dtoh_start)/CLOCKS_PER_SEC));
   printf("Total GPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(dtoh_end - htod_start)/CLOCKS_PER_SEC));

   cudaFree(d_c);
   cudaFree(d_b);
   cudaFree(d_a);
   free(h_ref);
   free(h_out);
   free(h_b);
   free(h_a);
}

////arguments :
////1 - kernel (0:1D or 1:2D), 
////2 - input size (2 pow (x))
////3 - for 2D kernel nx, 
////4 - block.x 
////5 - block.y  
int main(int argc, char** argv)
{
   printf("\n----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ \n\n");
   printf("argc : %d \n",argc);
   for (int i = 0; i < argc; i++)
   {
      printf("argv : %s \n",argv[i]);
   };

   run_sum_array_1d(argc, argv);

   //query_device();
   return 0;
}
                                                                                       
  2. Fortran code below
#include "Error.fpp"
MODULE CUDAOps
   USE cudafor
   USE CUDAUtils
   USE CPUOps
   IMPLICIT NONE

   CONTAINS

   ATTRIBUTES(GLOBAL) SUBROUTINE sumArraysGPU_1D(a,b,c,Nsize)
      IMPLICIT NONE

      !> intent variables
      INTEGER, INTENT(IN),    DIMENSION(:) :: a,b
      INTEGER, INTENT(INOUT), DIMENSION(:) :: c
      INTEGER, INTENT(IN),    VALUE        :: Nsize

      !> local variables
      INTEGER :: blockId, threadId

      ! get the blockId
      blockId = (blockIdx%x-1)

      ! get the threadId
      threadId = blockId * blockDim%x + threadIdx%x - 1

      ! adjust to let the threadId to start from 1
      threadId = threadId + 1


      !WRITE(*,*) 'threadId = ',threadId

      ! set the maximum
      IF (threadId <= Nsize) THEN

         ! perform the sum
         c(threadId) = a(threadId) + b(threadId)
      END IF

  END SUBROUTINE sumArraysGPU_1D

 SUBROUTINE runSumArrays1D(xpow,blockSizeX)
     IMPLICIT NONE

     ! intent variables
     INTEGER, INTENT(IN) :: xpow,blockSizeX

     !> variables declaration
     ! size of the arrays
     INTEGER:: Nsize
     ! size of the GPU block
     INTEGER:: block_size

     ! other auxiliary variables
     INTEGER          :: i,j,istat
     REAL(KIND=wp)    :: t1,t2,time,timeGPU
     TYPE(cudaEvent)  :: startEvent, stopEvent

     ! host data allocation
     INTEGER, DIMENSION(:), ALLOCATABLE :: h_a, h_b, h_c, gpu_results
     ! device data allocation
     INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a, d_b, d_c

     ! define the GPU grid and block
     TYPE(DIM3)            :: grid, tBlock

     ! define data size and block size along X dimension
     Nsize = 2**xpow
     block_size = 2**blockSizeX

     ! allocate memory in host
     ALLOCATE(h_a(Nsize))
     ALLOCATE(h_b(Nsize))
     ALLOCATE(h_c(Nsize))

     ! allocate memory in device
     ALLOCATE(gpu_results(Nsize))
     ALLOCATE(d_a(Nsize))
     ALLOCATE(d_b(Nsize))
     ALLOCATE(d_c(Nsize))

     ! define block and grid
     tBlock = DIM3(block_size,1,1)
     grid   = DIM3((Nsize/tBlock%x),1,1)

     ! host data initialization
     CALL generateNumberByIntegerDivision(h_a,10,Nsize)
     CALL generateNumberByIntegerDivision(h_b,7,Nsize)

     WRITE(*,*) 'Kernel is going to be launched with'
     WRITE(*,*) 'Nsize = ',Nsize
     WRITE(*,*) 'xpow = ',xpow
     WRITE(*,*) 'blockSizeX = ',blockSizeX
     WRITE(*,*) 'block_size = ',block_size
     WRITE(*,*) 'grid.x = ',grid%x
     WRITE(*,*) 'grid.y = ',grid%y
     WRITE(*,*) 'grid.z = ',grid%z
     WRITE(*,*) 'block.x = ',tblock%x
     WRITE(*,*) 'block.y = ',tblock%y
     WRITE(*,*) 'block.z = ',tblock%z
     timeGPU = 0.0_wp


     CALL CPU_TIME(t1)
     ! perform the sum in serial using the CPU
     CALL sumArraysCPU(h_a,h_b,h_c)
     CALL CPU_TIME(t2)
     WRITE(*,*) 'time for the CPU implementation (ms) = ',(t2-t1)*1e3

     ! initialize CUDA events
     !istat = cudaEventCreate(startEvent)
     GPU_ERROR(cudaEventCreate(startEvent))
     istat = cudaEventCreate(stopEvent)

     ! copy the source data h_a from CPU to GPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(d_a,h_a,Nsize,cudaMemcpyHostToDevice)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to transfer h_a to GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! copy the source data h_b from CPU to GPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(d_b,h_b,Nsize,cudaMemcpyHostToDevice)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to transfer h_b to GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! perform the sum on the GPU
     istat = cudaEventRecord(startEvent,0)
     CALL sumArraysGPU_1D<<<grid, tBlock>>>(d_a,d_b,d_c,Nsize)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to perform the sum on GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! copy the data back from GPU to CPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(gpu_results,d_c,Nsize,cudaMemcpyDeviceToHost)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to copy back data from GPU to CPU (ms) = ',time
     timeGPU = timeGPU + time
     WRITE(*,*) 'Total time to execute GPU (ms) :',timeGPU

     !WRITE(*,*) 'h_c = ',h_c
     !WRITE(*,*) 'gpu_results = ',gpu_results
     ! make a formal check of the result component by component
     CALL checkArraysCPU(h_c,gpu_results,Nsize)
     WRITE(*,*) 'SUM(h_c) = ',SUM(h_c)
     WRITE(*,*) 'SUM(gpu_results) = ',SUM(gpu_results)

     ! deallocate memory in host
     DEALLOCATE(h_a)
     DEALLOCATE(h_b)
     DEALLOCATE(h_c)

     ! deallocate memory in device
     DEALLOCATE(gpu_results)
     DEALLOCATE(d_a)
     DEALLOCATE(d_b)
     DEALLOCATE(d_c)

  END SUBROUTINE runSumArrays1D

END MODULE CUDAOps

PROGRAM main
   USE CPUOps
   USE CUDAOps

   IMPLICIT NONE

   ! declare local variables
   INTEGER :: i,xpow,sizeBlockX

   ! set the default values
   xpow       = 22
   sizeBlockX = 7

   ! launch the dedicated routines
   CALL runSumArrays1D(xpow,sizeBlockX)

   STOP
END PROGRAM main

When I run both codes with the default options (data size and block size) under nvprof, using this command:

nvprof ./code.x

I get the following output.

  1. for the C code:
----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ 

Runing 1D grid 
Input size : 4194304 
Kernel is lauch with grid(32768,1,1) and block(128,1,1) 
==33351== NVPROF is profiling process 33351, command: ./code_c.x
Arrays are same 
Sum array CPU execution time [ms] : 4.850000 
Sum array GPU execution time [ms] : 1.610000 
htod mem transfer time [ms] : 10.640000 
dtoh mem transfer time [ms] : 5.759000 
Total GPU execution time [ms] : 18.011000 
==33351== Profiling application: ./code_c.x
==33351== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   61.35%  10.715ms         2  5.3577ms  5.3566ms  5.3589ms  [CUDA memcpy HtoD]
                   30.94%  5.4040ms         1  5.4040ms  5.4040ms  5.4040ms  [CUDA memcpy DtoH]
                    5.81%  1.0147ms         1  1.0147ms  1.0147ms  1.0147ms  sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
                    1.90%  331.81us         1  331.81us  331.81us  331.81us  [CUDA memset]
      API calls:   75.67%  60.242ms         3  20.081ms  55.398us  60.116ms  cudaMalloc
                   20.59%  16.393ms         3  5.4645ms  5.2016ms  5.7578ms  cudaMemcpy
                    2.00%  1.5906ms         1  1.5906ms  1.5906ms  1.5906ms  cudaDeviceSynchronize
                    1.47%  1.1673ms         3  389.10us  186.65us  497.81us  cudaFree
                    0.14%  107.71us       101  1.0660us      88ns  57.578us  cuDeviceGetAttribute
                    0.08%  65.483us         1  65.483us  65.483us  65.483us  cuDeviceGetName
                    0.02%  17.946us         1  17.946us  17.946us  17.946us  cudaMemset
                    0.02%  16.011us         1  16.011us  16.011us  16.011us  cudaLaunchKernel
                    0.01%  8.6300us         1  8.6300us  8.6300us  8.6300us  cuDeviceGetPCIBusId
                    0.00%  1.1600us         3     386ns     146ns     846ns  cuDeviceGetCount
                    0.00%     369ns         2     184ns      94ns     275ns  cuDeviceGet
                    0.00%     246ns         1     246ns     246ns     246ns  cuDeviceTotalMem
                    0.00%     194ns         1     194ns     194ns     194ns  cuModuleGetLoadingMode
                    0.00%     167ns         1     167ns     167ns     167ns  cuDeviceGetUuid
  2. for the Fortran code:
==38266== NVPROF is profiling process 38266, command: ./code_f.x 
 Kernel is going to be launched with
 Nsize =       4194304
 xpow =            22
 blockSizeX =             7
 block_size =           128
 grid.x =         32768
 grid.y =             1
 grid.z =             1
 block.x =           128
 block.y =             1
 block.z =             1
 time for the CPU implementation (ms) =     4.997969    
 time to transfer h_a to GPU (ms) =     5.680192    
 time to transfer h_b to GPU (ms) =     5.561248    
 time to perform the sum on GPU (ms) =     1.510400    
 time to copy back data from GPU to CPU (ms) =     7.039712    
 Total time to execute GPU (ms) :    19.79155    
 Arrays are the same!
 SUM(h_c) =    1592097881
 SUM(gpu_results) =    1592097881
==38266== Profiling application: ./code_f.x 
==38266== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   58.75%  10.911ms         5  2.1822ms  1.2160us  5.4682ms  [CUDA memcpy HtoD]
                   35.16%  6.5297ms         1  6.5297ms  6.5297ms  6.5297ms  [CUDA memcpy DtoH]
                    6.10%  1.1321ms         1  1.1321ms  1.1321ms  1.1321ms  cudaops_sumarraysgpu_1d_
      API calls:   87.80%  150.69ms         6  25.115ms  2.5020us  150.30ms  cudaMalloc
                    9.95%  17.072ms         6  2.8454ms  4.1870us  7.0309ms  cudaMemcpy
                    1.39%  2.3788ms         6  396.47us  2.2640us  1.1368ms  cudaFree
                    0.72%  1.2281ms         4  307.02us  6.6590us  629.72us  cudaEventSynchronize
                    0.05%  93.254us       101     923ns      92ns  41.961us  cuDeviceGetAttribute
                    0.04%  64.982us         1  64.982us  64.982us  64.982us  cuDeviceGetName
                    0.02%  36.395us         8  4.5490us  1.1180us  13.299us  cudaEventRecord
                    0.02%  31.801us         2  15.900us     873ns  30.928us  cudaEventCreate
                    0.01%  18.638us         1  18.638us  18.638us  18.638us  cudaLaunchKernel
                    0.00%  6.3520us         4  1.5880us     970ns  2.5790us  cudaEventElapsedTime
                    0.00%  4.9980us         1  4.9980us  4.9980us  4.9980us  cuDeviceGetPCIBusId
                    0.00%  1.5290us         3     509ns     165ns  1.1890us  cuDeviceGetCount
                    0.00%     444ns         2     222ns      92ns     352ns  cuDeviceGet
                    0.00%     279ns         1     279ns     279ns     279ns  cuModuleGetLoadingMode
                    0.00%     248ns         1     248ns     248ns     248ns  cuDeviceTotalMem
                    0.00%     164ns         1     164ns     164ns     164ns  cuDeviceGetUuid

What I am trying to understand here is why the numbers of "cudaMalloc", "cudaMemcpy" and "cudaFree" calls agree with what I wrote in the C code, whereas they do not in the Fortran code. Specifically, although I allocate only 3 device arrays, nvprof reports 6 calls to cudaMalloc.

I am trying to understand whether there is a mistake/bug in my Fortran code, or whether this is normal behaviour and, if so, for what reason. Thank you.

I have tried to play with the allocation statements of the d_a, d_b and d_c arrays in Fortran. It looks as if the kernel call performs a cudaMalloc and a memory copy again, on top of the calls already done explicitly.

----------------- EDIT: An additional question. If I print some nvprof-specific metrics, such as:

nvprof --metrics gld_efficiency,sm_efficiency,achieved_occupancy ./code.x
  1. this is the C output:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "NVIDIA GeForce MX330 (0)"
    Kernel: sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
          1                             sm_efficiency                   Multiprocessor Activity      99.50%      99.50%      99.50%
          1                        achieved_occupancy                        Achieved Occupancy    0.922875    0.922875    0.922875
  2. and this is the Fortran output
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "NVIDIA GeForce MX330 (0)"
    Kernel: cudaops_sumarraysgpu_1d_
          1                            gld_efficiency             Global Memory Load Efficiency      67.86%      67.86%      67.86%
          1                             sm_efficiency                   Multiprocessor Activity      99.62%      99.62%      99.62%
          1                        achieved_occupancy                        Achieved Occupancy    0.877743    0.877743    0.877743

You can clearly see a difference in the Global Memory Load Efficiency. Are the two issues related?

CodePudding user response:

In CUDA Fortran, at least, a Fortran array has metadata that is needed by the (CUDA) Fortran device code generation.

This metadata results in 2 allocations per Fortran array: one for the actual data and one for the metadata.

One example of what the metadata might contain is the "width" of the array. (I'm using the term Fortran array loosely here; you won't witness this metadata for every sort of device allocation.)

Naturally, since the metadata is needed (in this case, as determined by the CUDA Fortran compiler) and is allocated separately, you are going to witness 2 copy operations per array as well, and, correspondingly, 2 free operations.
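
To make this concrete, here is a minimal CUDA Fortran sketch (not from the original post; the file name, array size, and build commands are my own assumptions) that allocates, writes to, and frees a single allocatable device array. Profiling it with nvprof should show whether the cudaMalloc/cudaMemcpy/cudaFree counts are roughly doubled per array, as described above.

! minimal_alloc.cuf -- hypothetical file name, not part of the original post.
! One allocatable device array is allocated, written to, and freed; if the
! per-array metadata described above exists, nvprof should report about twice
! as many cudaMalloc/cudaFree calls (and an extra small HtoD copy) compared
! with the single explicit ALLOCATE, assignment, and DEALLOCATE visible here.
!
! Assumed build/profile commands (adjust to your toolchain):
!   nvfortran minimal_alloc.cuf -o minimal_alloc.x
!   nvprof ./minimal_alloc.x
PROGRAM minimal_alloc
   USE cudafor
   IMPLICIT NONE

   INTEGER, PARAMETER :: n = 2**22
   INTEGER, DIMENSION(:), ALLOCATABLE         :: h_a
   INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a
   INTEGER :: istat

   ! one explicit host allocation and one explicit device allocation
   ALLOCATE(h_a(n))
   ALLOCATE(d_a(n))

   h_a = 1

   ! one explicit host-to-device transfer
   ! (array assignment between host and device arrays maps to a memcpy)
   d_a = h_a

   istat = cudaDeviceSynchronize()

   ! one explicit device deallocation
   DEALLOCATE(d_a)
   DEALLOCATE(h_a)
END PROGRAM minimal_alloc

Comparing the API-call counts from this one-array case with the three-array case in the question should show whether the extra calls scale with the number of allocatable device arrays, as the metadata explanation predicts.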
