I wrote a program for matrix product state in Fortran 90. In this program, I use the Intel MKL library. When the compiler option is:
OPTIMIZE= -parallel -par-threshold90 -ipo -O3 -no-prec-div -fp-model fast=2 -xHost
LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl -static-intel,
in the runtime, the memory keeps increasing. If I change the compiler option to:
OPTIMIZE= -ipo -O3 -no-prec-div -fp-model fast=2 -xHost
LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl=sequential -static-intel,
there is no problem, everything is fine. Why does this happen? Is there any solution to this?
Follow the suggestions by Alexander, I post one of the subroutines (performing singular value decomposition for a matrix) in the following:
SUBROUTINE SVD(Theta,S1,S2,lambda,Din,Dout)
IMPLICIT NONE
INTEGER, INTENT(IN) :: Din, Dout
COMPLEX(kind=rKind),DIMENSION(Din,Din), INTENT(IN) :: Theta
REAL(kind=rKind), INTENT(OUT) :: lambda(Dout)
COMPLEX(kind=rKind), INTENT(OUT) :: S1(Din,Dout),S2(Din,Dout)
COMPLEX(kind=rKind) :: M(Din,Din)
REAL(kind=rKind) :: temp_lam(Din)
COMPLEX(kind=rKind) :: temp_U(Din,Din), temp_V(Din,Din)
COMPLEX(kind=rKind) :: WORK(10*Din)
REAL(kind=rKind) :: RWORK(5*Din)
REAL(kind=rKind), PARAMETER :: Truncation = 1E-13_rKind
INTEGER :: INFO, i
!----------------------------
M = Theta
CALL ZGESVD('A','A',Din,Din,M,Din,temp_lam,temp_U,Din,temp_V,Din,WORK,10*Din,RWORK,INFO)
lambda = 0.0_rKind
S1 = 0.0_rKind
S2 = 0.0_rKind
DO i = 1,Dout
IF (temp_lam(i)/temp_lam(1)>Truncation) THEN
lambda(i) = temp_lam(i)
S1(:,i) = temp_U(:,i)
S2(:,i) = temp_V(i,:)
END IF
END DO
END SUBROUTINE SVD
And another subroutine is
SUBROUTINE TwoSiteUpdate(A_update,B_update,lambda_2_update,&
& A,B,lambda_1,lambda_2,U,D1,D2_old,D2_new)
! Designed for 1D MPS.
IMPLICIT NONE
INTEGER, INTENT(IN) :: D1, D2_old, D2_new
COMPLEX(kind=rKind), INTENT(IN) :: A(D1,D2_old,localSize), B(D2_old,D1,localSize)
REAL(kind=rKind), INTENT(IN) :: lambda_1(D1), lambda_2(D2_old)
COMPLEX(kind=rKind), INTENT(IN) :: U(localSize,localSize,localSize,localSize)
COMPLEX(kind=rKind), INTENT(OUT) :: A_update(D1,D2_new,localSize), B_update(D2_new,D1,localSize)
REAL(kind=rKind), INTENT(OUT) :: lambda_2_update(D2_new)
COMPLEX(kind=rKind) :: temp_A(D1,D2_old,localSize)
COMPLEX(kind=rKind) :: temp_B(D2_old,D1,localSize)
COMPLEX(kind=rKind) :: Theta(D1,localSize,D1,localSize)
COMPLEX(kind=rKind) :: Theta_S(D1*localSize,D1*localSize)
COMPLEX(kind=rKind) :: S1(D1*localSize,D2_new), S2(D1*localSize,D2_new)
COMPLEX(kind=rKind) :: S1_tmp(D1,localSize,D2_new)
COMPLEX(kind=rKind) :: S2_tmp(D1,localSize,D2_new)
REAL(kind=rKind), PARAMETER :: Truncation=1E-16_rKind
COMPLEX(kind=rKind) :: tmp_ele
INTEGER :: i1,i2,i3,im
INTEGER :: mA1,mB1,mA2,mB2
INTEGER :: i
!------------------- add lambda on A, B ----------------------
temp_A = 0.0_rKind
temp_B = 0.0_rKind
DO im = 1,localSize
DO i2 = 1,D2_old
DO i1 = 1,D1
IF(lambda_1(i1)>Truncation) THEN
temp_A(i1,i2,im) = A(i1,i2,im)*SQRT(lambda_1(i1))
temp_B(i2,i1,im) = B(i2,i1,im)*SQRT(lambda_1(i1))
END IF
END DO
END DO
END DO
!--------------- svd on Theta ----------------
Theta = 0.0_rKind
DO mB2 = 1,localSize
DO i2 = 1,D1
DO mA2 = 1,localSize
DO i1 = 1,D1
tmp_ele = 0.0_rKind
DO mB1 = 1,localSize
DO mA1 = 1,localSize
DO i3 = 1,D2_old
tmp_ele = tmp_ele + U(mA2,mB2,mA1,mB1)*temp_A(i1,i3,mA1)*temp_B(i3,i2,mB1)
END DO
END DO
END DO
Theta(i1,mA2,i2,mB2) = tmp_ele
END DO
END DO
END DO
END DO
Theta_S = RESHAPE(Theta,SHAPE=(/D1*localSize,D1*localSize/))
CALL SVD(Theta_S,S1,S2,lambda_2_update,D1*localSize,D2_new)
lambda_2_update = lambda_2_update/lambda_2_update(1)
S1_tmp = RESHAPE(S1,SHAPE=(/D1,localSize,D2_new/))
S2_tmp = RESHAPE(S2,SHAPE=(/D1,localSize,D2_new/))
!---------------- update A, B ----------------
A_update = 0.0_rKind
B_update = 0.0_rKind
DO im = 1,localSize
DO i2 = 1,D2_new
DO i1 = 1,D1
IF (lambda_1(i1)>Truncation) THEN
A_update(i1,i2,im) = S1_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
B_update(i2,i1,im) = S2_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
END IF
END DO
END DO
END DO
END SUBROUTINE TwoSiteUpdate
Above two subroutines are the time consuming part of the program. Without these two subroutines, the program is fine and the memory does not increase. With these two, the memory do increase when I use the parallel version.