0

I have made a parallel version of a heat transfer simulation program with Open MP in Fortran 95, and it seems to work (results are equal). However, the time elapsed ("wall clock" time) is the same as the sequential version. My default number of threads is 4, but even if I change it, the computing time is around the same... I am new with Open MP and parallelization too, so I am struggling to understand where are my errors. Maybe you could help me to find them?

Edit:

Here is my code

program heat
    use omp_lib 
    implicit none

! Variables
    real,PARAMETER :: alpha = 1e-02 ! heat diffusive coefficient
    integer,PARAMETER ::  s = 10 ! Kelvin/s
    integer,PARAMETER ::  L = 1 ! cubic side length
    real,PARAMETER :: dt = 1e-03   ! time step
    real,PARAMETER :: dx = 1e-02   ! space step 
    real,PARAMETER :: dy = 1e-02   ! space step 
    real,PARAMETER :: dz = 1e-02   ! space step
    integer,PARAMETER :: N = int(L/dx)   ! Finite volumes
    integer,PARAMETER :: t_obs = 120   ! secondes
    integer,PARAMETER :: ite = int(t_obs/dt)   ! iterations
    real :: c = (alpha*dt)/(dx**2)  ! scheme stability criteria
    integer :: total_length = (N+2)  

    real,ALLOCATABLE,DIMENSION(:,:,:) :: T
    ! Boundaries conditions
    real :: T_R = 298   ! right 298
    real :: T_L = 298   ! left 298
    real :: T_F = 323   ! front 323
    real :: T_B = 323   ! back 323
    real :: T_U = 373   ! up 373
    real :: T_D = 373   ! down 373
    integer :: i,j,k,iteration

    
    print*, "c = ",c

    ALLOCATE(T(total_length,total_length,total_length))

 ! Initial condition
    do i = 1,N+2
        do j = 1, N+2
            do k = 1, N+2
                
                if (j == 1 .and. k /= 1 .and. k /= N+2) then
                    T(i,k,j) = T_F 
                else if (i == N+2 .and. j /= N+2 .and. k /= N+2 .and. k /= 1) then
                    T(i,k,j) = T_R 
                else if (i == 1 .and. j /= 1 .and. k /= 1 .and. k /= N+2) then 
                    T(i,k,j) = T_L 
                else if (i /= 1 .and. j == N+2 .and. k /= 1 .and. k /= N+2) then
                    T(i,k,j) = T_B 
                else if (k == N+2) then 
                    T(i,k,j) = T_U 
                else if (k == 1) then 
                    T(i,k,j) = T_D 
                else
                    T(i,k,j) = 283
                end if 
                
            end do 
        end do 
    end do 



 ! Datas in file:

    open(10,file='para_heat_3D_unsteady2.dat',FORM = 'UNFORMATTED')
    

 ! Iterative loop
!$omp parallel default(shared) private(i,j,k)
    do iteration = 0,15
        !$omp single
        print*, iteration 
        !$omp end single
            
                if (iteration == 0) then
                !$omp single
                    i = 1
                    do while (i < N+3) 

                        do j = 1, N+2
                                WRITE(10) T(i,:,j)
                               ! WRITE(10,*) T(i,:,j) ! formatted
                        end do
                        i = i + 1
                    end do
                !$omp end single 
                else
                ! Boundary Conditions
                    T(:,:,1) = T_F
                    T(N+2,:,:) = T_R
                    T(1,:,:) = T_L
                    T(:,:,N+2) = T_B
                    T(:,N+2,:) = T_U
                    T(:,1,:) = T_D

                    !$omp do 
                    do i = 2, N+1
                        do j = 2, N+1
                            do k = 2, N+1 
                                if (i == N+1 .and. j == N+1 .and. k == N+1) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + T_R * (2*c) + T(i-1,k,j)*c + T_B *(2*c) + c*T(i,k,j-1) + & 
                                    2*c*T_U + c*T(i,k-1,j) + s*dt
                                else if (i == 2 .and. j == N+1 .and. k == N+1) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + T_L * (2*c) + c*T(i+1,k,j) + 2*c*T_B + c*T(i,k,j-1) + 2*c*T_U &
                                    + c*T(i,k-1,j) + s*dt
                                else if (i == N+1 .and. j == N+1 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + 2*c*T_R + c*T(i-1,k,j) + 2*c*T_B  + c*T(i,k,j-1) + 2*c*T_D + &
                                    c*T(i,k+1,j) + s*dt 
                                else if (i == 2 .and. j == N+1 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + 2*c*T_L + c*T(i+1,k,j) + 2*c*T_B + c*T(i,k,j-1) + 2*c*T_D + & 
                                    c*T(i,k+1,j) + s*dt 
                                else if (i /= 1 .and. i /= N+2 .and. j == 2 .and. k == N+1) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + c*T(i+1,k,j) + c*T(i-1,k,j) + c*T(i,k,j+1) + 2*c*T_F + 2*c*T_U &
                                    + c*T(i,k-1,j) + s*dt 
                                else if (i == 2 .and. j == 2 .and. k /= 1 .and. k /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + c*T(i+1,k,j) + 2*c*T_L + c*T(i,k,j+1) + 2*c*T_F + &
                                    c*(T(i,k+1,j) + T(i,k-1,j)) + s*dt
                                else if (i /= 1 .and. i /= N+2 .and. j == 2 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + c*T(i-1,k,j) + c*T(i+1,k,j) + c*T(i,k,j+1) + 2*c*T_F + &
                                    c*T(i,k+1,j) + 2*c*T_D + s*dt
                                else if (i == N+1 .and. j == 2 .and. k /= 1 .and. k /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_R + c*T(i-1,k,j) + c*T(i,k,j+1) + 2*c*T_F + &
                                    c*(T(i,k+1,j) + T(i,k-1,j)) + s*dt 
                                else if (i == N+1 .and.  j /= 1 .and. j /= N+2 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_R + c*T(i-1,k,j) + c*(T(i,k,j+1) + T(i,k,j-1)) + &
                                    c*T(i,k+1,j) + 2*c*T_D + s*dt 
                                else if (i == N+1 .and. j == N+1 .and. k /= 1 .and. k /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_R + c*T(i-1,k,j) + 2*c*T_B + c*T(i,k,j-1) + c*T(i,k+1,j) &
                                    + c*T(i,k-1,j) + s*dt 
                                else if (i == N+1 .and. j /= 1 .and. j /= N+2 .and. k == N+1) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_R + c*T(i-1,k,j) + c*T(i,k,j+1) + c*T(i,k,j-1) + 2*c*T_U &
                                    + c*T(i,k-1,j) + s*dt 
                                else if (j == N+1 .and. k == N+1 .and. i /= 1 .and. i /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + c*T(i+1,k,j) + c*T(i-1,k,j) + 2*c*T_B + c*T(i,k,j-1) + 2*c*T_U &
                                    + c*T(i,k-1,j) + s*dt 
                                else if (i == 2 .and. k == N+1 .and. j /= 1 .and. j /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_L + c*T(i+1,k,j) + 2*c*T_U + c*T(i,k,j-1)+ c*T(i,k,j+1) &
                                    + c*T(i,k-1,j) + s*dt
                                else if (i == 2 .and. j == N+1 .and. k /= 1 .and. k /= N+2) then 
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_L + c*T(i+1,k,j) + 2*c*T_B + c*T(i,k,j-1) + c*T(i,k+1,j) &
                                    + c*T(i,k-1,j) + s*dt 
                                else if (k == 2 .and. j == N+1 .and. i /= 1 .and. i /= N+2) then
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + c*T(i+1,k,j) + c*T(i-1,k,j) + 2*c*T_B + c*T(i,k,j-1) &
                                    + c*T(i,k+1,j) + 2*c*T_D + s*dt 
                                else if (i == 2 .and. k == 2 .and. j /= 1 .and. j /= N+2) then 
                                    T(i,k,j) = T(i,k,j) * (1-8*c) + 2*c*T_L + c*T(i+1,k,j) + c*T(i,k,j+1) + c*T(i,k,j-1) + &
                                    c*T(i,k+1,j) + 2*c*T_D + s*dt
                                else if (i == N+1 .and. j == 2 .and. k == N+1) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + c*T(i+1,k,j) + 2*c*T_L + c*T(i,k,j+1) + 2*c*T_F + c*T(i,k-1,j) &
                                    + 2*c*T_U + s*dt 
                                else if (i == N+1 .and. j == 2 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + c*T(i-1,k,j) + 2*c*T_R + c*T(i,k,j+1) + 2*c*T_F + c*T(i,k-1,j) &
                                    + 2*c*T_U + s*dt 
                                else if (i == 2 .and. j == 2 .and. k == 2) then
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + c*T(i+1,k,j) + 2*c*T_L + c*T(i,k,j+1) + 2*c*T_F + 2*c*T_D + &
                                    c*T(i,k+1,j) + s*dt 
                                else if (i == N+1 .and. j == 2 .and. k == 2) then 
                                    T(i,k,j) = T(i,k,j) * (1-9*c) + c*T(i-1,k,j) + 2*c*T_R + c*T(i,k,j+1) + 2*c*T_F + 2*c*T_D + &
                                    c*T(i,k+1,j) + s*dt
                                else
                                    T(i,k,j) = T(i,k,j) * (1-6*c) + c*T(i-1,k,j) + c*T(i+1,k,j) + c*T(i,k,j+1) + c*T(i,k,j-1) + &
                                    c*T(i,k-1,j) + c*T(i,k+1,j) + s*dt 
                                end if 
                            end do 
                        end do 
                    end do 
                    !$omp end do

                ! Print in a file T values 
                !$omp single
                    i = 1
                    do while (i < N+3) 
                        do j = 1, N+2
                            WRITE(10) T(i,:,j)
                            ! WRITE(10,*) T(i,:,j) ! formatted
                        end do
                        i = i + 1 
                    end do 
                !$omp end single
                end if
    end do 
!$omp end parallel 
    


    DEALLOCATE(T)

end program heat



So for these 15 iterations, with the formatted way the time elapsed is now around 16 seconds, whatever the number of threads, and is a little bit less than sequential (around 21 sec). With the unformatted way it is highly faster (around 1 sec) but I receive symbols like "�.." (maybe binary ?) and I don't know how to post-treat it and plot something with it... I compile it with: gfortran -fopenmp -g -fcheck=all -Wall para_heat_3D_unsteady.f95 I have no environment variables set.

Jejouze
  • 23
  • 6
  • 3
    Please show us a complete program, and how you compile it, and *exactly* how you run it, including setting any environment variables if necessary - my suspicion is you spend your whole time writing the results to file rather than any threaded work, but without a complete example it is very difficult to tell. – Ian Bush Oct 12 '21 at 17:28
  • 1
    Also why on earth is T not a 3 dimensional array? Also those if conditions in the loop are horrible and will kill performance - consider using something like a halo to avoid it. – Ian Bush Oct 12 '21 at 17:28
  • 1
    How do you measure the time? Please understand that *reproducible* means that others can reproduce it as well. That means that the code must be complete and compilable. – Vladimir F Героям слава Oct 12 '21 at 17:48
  • 1
    They're sometimes known as ghost cells. It looks like the program is a finite difference equation in 3D using periodic boundary conditions, so instead of a whole load of if( i am on the edge ) stuff in the main loop, just make the array bigger by one in each direction, and before the main loop copy into this surface layer the required values for the finite difference. Then just loop over the domain - no ifs are required. You have an overhead proportional to the surface area of the domain, but if the domain is big this is negligible. – Ian Bush Oct 12 '21 at 19:40
  • Saying "computing time" suggests that you are measuring CPU time (which is very unlikely to reduce from introducing parallelism since you have to do the same amount of work), rather than elapsed, "wall clock" time, which you would hope can be reduced by using parallelism. – Jim Cownie Oct 13 '21 at 08:17
  • Thanks for all your comments. I edited my post with the complete program and with info about compilation and execution. Indeed, I am interested in the time elapsed ("wall clock" time) and not the computing time actually... As the execution is fast enough with only 15 iterations, I measure the time elapsed with a simple Chrono app... I tried with the function Call Cpu_time but it seems that it is not the same time, and in Fortran, I don't know exactly how to deal with the time elapsed. – Jejouze Oct 16 '21 at 14:13
  • Equations are finite volume in 3D @IanBush. I don't know how to do with ghost cells, but according to what I understood, it can be an optimization for my code, but not only for the parallel case, the sequential case too. So it doesn't explain why I have about the same elapsed time between parallel and sequential versions .. ? I am waiting for your answers, thanks in advance. – Jejouze Oct 16 '21 at 14:14
  • We have several questions and answers about measuring execution time in parallel. `CPU_TIME()` measures the sum of all threads and you do *not* want it here. Better is the OpenMP `omp_get_wtime()` or Fortran `system_clock()`. – Vladimir F Героям слава Oct 16 '21 at 15:16
  • Also, I would always use compiler optimizations,,e.g., `-O3`. – Vladimir F Героям слава Oct 16 '21 at 15:18
  • 1
    At least on my machine you are spending 90%+ in the I/O. This is single threaded. That is why you are seeing no speed up. The "work" loop itself speeds up from 0.26s to 0.1s on going from 1 to 4 threads, but the I/O time stays constant at around 9.6s – Ian Bush Oct 16 '21 at 16:48
  • 1
    Note also you have a race condition in the way the threads write to T - assuming I have understood your code correctly. Certainly I get different results with differing numbers of threads. – Ian Bush Oct 16 '21 at 17:19
  • Thank you @IanBush for your explanation of the problem's source, but I don't understand how to resolve it and speed up also the I/O ;..? Thank you also VladimirF for your help in measuring the time. – Jejouze Oct 17 '21 at 09:45
  • Your I/O is very, very slow because you are using formatted I/O to write one element. Ideally you want to write may elements at a time unformatted. What you are doing at the moment is about the lesat efficient way possible. – Ian Bush Oct 17 '21 at 11:01
  • So the only way to resolve this is even not about parallelism, but I just have to remove the formatted I/O? – Jejouze Oct 17 '21 at 13:57
  • Any improvement to the I/O would be beneficial - note you can't just "get rid off the formatted I/O" as you can't do non-advancing unformatted I/O. I would write an answer suggesting something, but the confusing way you have written it with a 1D array for T means it would be more effort than I am willing to put in. I suggest in future if you have a 3 dimensional object you represent it with a 3 dimensional array. But before you do any of that fix the program so it give the correct answers - that is infinitely more important than any speed considerations. – Ian Bush Oct 17 '21 at 14:15
  • https://stackoverflow.com/questions/43637321/efficiency-in-writing-to-disk-in-fortran/43639246#43639246 might be of use – Ian Bush Oct 17 '21 at 14:26
  • After I ran my code with different numbers of threads (2,4,5,8), it seems that the results are equal and good. I have changed my T array to a 3D array and used your suggestion to print many elements at a time (1 complete direction). I tried to use unformatted I/O as you did in the previous post, but it gives me symbols like " �..." (maybe binary things?) but I don't know how to deal with that, if I want to post-treat data and plot something with it... ? Finally, I don't know how to use these ghost cells ..? But I changed the way I deal with boundaries. All things are in my edited post. – Jejouze Oct 22 '21 at 14:50
  • How I can improve my I/O performance since I changed my T array to a 3D array and print many elements at a time? – Jejouze Nov 02 '21 at 09:38

0 Answers0