To make things easy, here's the sample code, ready to be compiled and run:
module elemWiseOps
  USE cudafor
  USE cublas
  !
  ! Kind parameters for single-precision (SP) and double-precision (DP) reals
  !
  IMPLICIT NONE
  !
  INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6, 37)   ! REAL32
  INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307) ! REAL64
  !
  INTERFACE VMUL
    MODULE PROCEDURE dVmul
  END INTERFACE
  INTERFACE VADD
    MODULE PROCEDURE dVadd
  END INTERFACE
CONTAINS
  attributes(global) subroutine dVmul(a, b)
    !
    ! Element-wise multiply: b = a * b
    !
    IMPLICIT NONE
    !
    REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
    REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
    !
    ! local variables
    !
    INTEGER :: i1, j, n(2)
    i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
    j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
    n(1) = size(a,1)
    n(2) = size(a,2)
    if (i1 <= n(1) .and. j <= n(2)) b(i1,j) = a(i1,j) * b(i1,j)
  end subroutine dVmul
  attributes(global) subroutine dVadd(a, b)
    !
    ! Element-wise add: b = a + b
    !
    IMPLICIT NONE
    !
    REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
    REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
    !
    ! local variables
    !
    INTEGER :: i1, j, n(2)
    i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
    j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
    n(1) = size(a,1)
    n(2) = size(a,2)
    if (i1 <= n(1) .and. j <= n(2)) b(i1,j) = a(i1,j) + b(i1,j)
  end subroutine dVadd
  attributes(global) subroutine dPrintV(a)
    !
    ! Print the array contents; meant to be launched with <<<1,1>>>,
    ! since every thread executes the full loop nest.
    !
    IMPLICIT NONE
    !
    REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in) :: a
    !
    ! local variables
    !
    INTEGER :: i1, j1, n(2)
    n(1) = size(a,1)
    n(2) = size(a,2)
    DO j1 = 1, n(2)
      DO i1 = 1, n(1)
        print *, a(i1,j1)
      ENDDO
    ENDDO
  end subroutine dPrintV
end module elemWiseOps
PROGRAM Test
  !
  ! Main program: initialize two m x n arrays, copy them to the
  ! device and launch the VADD kernel three times in a row.
  !
  USE cudafor
  USE cublas
  USE elemWiseOps
  !
  IMPLICIT NONE
  !
  REAL(DP), ALLOCATABLE, DIMENSION(:,:) :: a, b
  REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:) :: a_d, b_d
  !
  INTEGER, PARAMETER :: m = 500, n = 1000
  INTEGER :: i1, i2, istat
  type(dim3) :: grid, tBlock
  !
  tBlock = dim3(32, 32, 1)
  grid = dim3(ceiling(real(m)/tBlock%x), &
              ceiling(real(n)/tBlock%y), 1)
  !
  ! Allocate storage for the arrays
  !
  Allocate(a(m,n), b(m,n))
  Allocate(a_d(m,n), b_d(m,n))
  !
  ! Initialize the host arrays
  !
  Do i2 = 1, n
    Do i1 = 1, m
      a(i1,i2) = REAL(i1, DP)
      b(i1,i2) = REAL(i1, DP)
    Enddo
  Enddo
  !
  ! Copy to the device arrays
  !
  istat = cudaMemcpy2D(a_d, m, a, m, m, n)
  istat = cudaMemcpy2D(b_d, m, b, m, m, n)
  !
  ! Now invoke the kernel three times
  !
  Call vadd<<<grid,tBlock>>>(a_d, b_d)
  Call vadd<<<grid,tBlock>>>(a_d, b_d)
  Call vadd<<<grid,tBlock>>>(a_d, b_d)
  !
  ! Free storage for the arrays
  !
  Deallocate(a, b)
  Deallocate(a_d, b_d)
  !
END PROGRAM Test
I compiled the code with:
nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90
Then I profiled it with:
nvprof --print-api-trace ./a.out
Here's what happens when I call the kernel three times in a row:
... ...
171.32ms 287.59ms cudaMalloc
458.94ms 98.290us cudaMalloc
462.69ms 816.76us cudaMemcpy2D
463.51ms 869.13us cudaMemcpy2D
464.39ms 134.45us cudaMalloc
464.52ms 12.703us cudaMemcpy
464.54ms 5.2460us cudaMalloc
464.54ms 4.1840us cudaMemcpy
464.55ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms 123.66us cudaFree
464.73ms 78.753us cudaFree
464.81ms 107.87us cudaMalloc
464.92ms 6.6480us cudaMemcpy
464.92ms 3.7720us cudaMalloc
464.93ms 3.8070us cudaMemcpy
464.93ms 10.245us cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms 124.35us cudaFree
465.07ms 56.498us cudaFree
465.12ms 93.173us cudaMalloc
465.22ms 6.2260us cudaMemcpy
465.22ms 3.6010us cudaMalloc
465.23ms 3.6800us cudaMemcpy
465.23ms 7.6920us cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms 125.72us cudaFree
465.37ms 47.850us cudaFree
465.64ms 60.294us cudaFree
465.70ms 208.73us cudaFree
My question is: is it possible to avoid the overhead of the cudaMalloc, cudaMemcpy and cudaFree calls that surround each kernel launch in CUDA Fortran? Many thanks!
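In case it helps narrow things down, my working guess is that the assumed-shape dummy arguments (REAL(DP), DIMENSION(:,:)) make the runtime build an array descriptor and copy it to the device on every launch, which would account for the cudaMalloc/cudaMemcpy/cudaFree triplets bracketing each cudaLaunchKernel in the trace above. The workaround I have in mind, but haven't verified, is an explicit-shape kernel that takes the extents by value (dVadd2 is just a name I made up for this sketch):

  attributes(global) subroutine dVadd2(a, b, n1, n2)
    !
    ! Same element-wise add as dVadd, but with explicit-shape dummies:
    ! the extents travel by value in the launch arguments, so no
    ! descriptor should need to be allocated and copied per call.
    !
    IMPLICIT NONE
    !
    INTEGER, VALUE :: n1, n2
    REAL(DP), DIMENSION(n1,n2), DEVICE, INTENT(in)    :: a
    REAL(DP), DIMENSION(n1,n2), DEVICE, INTENT(inout) :: b
    !
    INTEGER :: i1, j
    i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
    j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
    if (i1 <= n1 .and. j <= n2) b(i1,j) = a(i1,j) + b(i1,j)
  end subroutine dVadd2

with the corresponding launch being

  Call dVadd2<<<grid,tBlock>>>(a_d, b_d, m, n)

Is that the right direction, or is there a cleaner way to keep the assumed-shape interface without the per-launch allocations?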